mirror of
https://github.com/PaddlePaddle/FastDeploy.git
synced 2025-12-24 13:28:13 +08:00
[Executor] Default use CUDAGraph (#3594)
* add start intercept * Adjustment GraphOptConfig * pre-commit * default use cudagraph * set default value * default use cuda graph * pre-commit * fix test case bug * disable rl * fix moba attention * only support gpu * Temporarily disable PD Disaggregation * set max_num_seqs of test case as 1 * set max_num_seqs and temperature * fix max_num_batched_tokens bug * close cuda graph * success run wint2 * profile run with max_num_batched_tokens * 1.add c++ memchecker 2.success run wint2 * update a800 yaml * update docs * 1. delete check 2. fix plas attn test case * default use use_unique_memory_pool * add try-except for warmup * ban mtp, mm, rl * fix test case mock * fix ci bug * fix form_model_get_output_topp0 bug * fix ci bug * refine deepseek ci * refine code * Disable PD * fix sot yaml
This commit is contained in:
@@ -354,10 +354,6 @@ class EngineArgs:
|
||||
"""
|
||||
SplitWise Use, Results Writer Batch Size
|
||||
"""
|
||||
use_cudagraph: bool = False
|
||||
"""
|
||||
Flags to enable Cuda Graph
|
||||
"""
|
||||
graph_optimization_config: Optional[Dict[str, Any]] = None
|
||||
"""
|
||||
Configuration for graph optimization backend execution.
|
||||
@@ -586,17 +582,11 @@ class EngineArgs:
|
||||
"is lower than that of the config file. "
|
||||
"More complex quantization methods need to be configured via the config file.",
|
||||
)
|
||||
model_group.add_argument(
|
||||
"--use-cudagraph",
|
||||
action="store_true",
|
||||
default=EngineArgs.use_cudagraph,
|
||||
help="Flags to enable cuda graph.",
|
||||
)
|
||||
model_group.add_argument(
|
||||
"--graph-optimization-config",
|
||||
type=json.loads,
|
||||
default=EngineArgs.graph_optimization_config,
|
||||
help="",
|
||||
help="Configuration for graph optimization",
|
||||
)
|
||||
model_group.add_argument(
|
||||
"--plas-attention-config",
|
||||
@@ -1057,7 +1047,6 @@ class EngineArgs:
|
||||
parallel_cfg = ParallelConfig(all_dict)
|
||||
scheduler_cfg = self.create_scheduler_config()
|
||||
graph_opt_cfg = self.create_graph_optimization_config()
|
||||
graph_opt_cfg.update_use_cudagraph(self.use_cudagraph)
|
||||
plas_attention_config = self.create_plas_attention_config()
|
||||
|
||||
early_stop_cfg = self.create_early_stop_config()
|
||||
|
||||
Reference in New Issue
Block a user