[Executor] CUDA Graph support padding batch (#2844)

* cuda graph support padding batch

* Integrate the startup parameters for the graph optimization backend and provide support for user-defined capture sizes.

* Do not insert max_num_seqs when the user specifies a capture list

* Support set graph optimization config from YAML file

* update cuda graph ci

* fix ci bug

* fix ci bug
This commit is contained in:
RAM
2025-07-16 10:49:01 +08:00
committed by GitHub
parent 61b3997b85
commit 0fad10b35a
30 changed files with 291 additions and 225 deletions

View File

@@ -2,4 +2,5 @@ max_model_len: 32768
max_num_seqs: 128
kv_cache_ratio: 0.75
tensor_parallel_size: 1
enable_static_graph_inference: True
graph_optimization_config:
graph_opt_level: 1

View File

@@ -2,4 +2,5 @@ max_model_len: 32768
max_num_seqs: 128
kv_cache_ratio: 0.75
tensor_parallel_size: 1
enable_static_graph_inference: True
graph_optimization_config:
graph_opt_level: 1

View File

@@ -3,4 +3,5 @@ max_num_seqs: 128
kv_cache_ratio: 0.75
tensor_parallel_size: 1
quantization: wint8
enable_static_graph_inference: True
graph_optimization_config:
graph_opt_level: 1

View File

@@ -3,4 +3,5 @@ max_num_seqs: 128
kv_cache_ratio: 0.75
tensor_parallel_size: 1
quantization: wint8
enable_static_graph_inference: True
graph_optimization_config:
graph_opt_level: 1

View File

@@ -2,4 +2,5 @@ max_model_len: 32768
max_num_seqs: 128
kv_cache_ratio: 0.75
tensor_parallel_size: 1
enable_static_graph_inference: True
graph_optimization_config:
graph_opt_level: 1

View File

@@ -3,4 +3,5 @@ max_num_seqs: 128
kv_cache_ratio: 0.75
tensor_parallel_size: 1
quantization: wint4
enable_static_graph_inference: True
graph_optimization_config:
graph_opt_level: 1

View File

@@ -3,4 +3,5 @@ max_num_seqs: 96
gpu_memory_utilization: 0.9
kv_cache_ratio: 0.71
tensor_parallel_size: 4
enable_static_graph_inference: True
graph_optimization_config:
graph_opt_level: 1

View File

@@ -2,4 +2,5 @@ max_model_len: 32768
max_num_seqs: 128
kv_cache_ratio: 0.75
tensor_parallel_size: 1
enable_static_graph_inference: True
graph_optimization_config:
graph_opt_level: 1

View File

@@ -2,4 +2,5 @@ max_model_len: 32768
max_num_seqs: 128
kv_cache_ratio: 0.75
tensor_parallel_size: 1
enable_static_graph_inference: True
graph_optimization_config:
graph_opt_level: 1

View File

@@ -3,4 +3,5 @@ max_num_seqs: 128
kv_cache_ratio: 0.75
tensor_parallel_size: 1
quantization: wfp8afp8
enable_static_graph_inference: True
graph_optimization_config:
graph_opt_level: 1

View File

@@ -2,4 +2,5 @@ max_model_len: 32768
max_num_seqs: 128
kv_cache_ratio: 0.75
tensor_parallel_size: 1
enable_static_graph_inference: True
graph_optimization_config:
graph_opt_level: 1

View File

@@ -2,4 +2,5 @@ max_model_len: 32768
max_num_seqs: 128
kv_cache_ratio: 0.75
tensor_parallel_size: 1
enable_static_graph_inference: True
graph_optimization_config:
graph_opt_level: 1

View File

@@ -3,4 +3,5 @@ max_num_seqs: 128
kv_cache_ratio: 0.75
tensor_parallel_size: 1
quantization: wint8
enable_static_graph_inference: True
graph_optimization_config:
graph_opt_level: 1

View File

@@ -3,4 +3,5 @@ max_num_seqs: 128
kv_cache_ratio: 0.75
tensor_parallel_size: 1
quantization: wint8
enable_static_graph_inference: True
graph_optimization_config:
graph_opt_level: 1

View File

@@ -2,4 +2,5 @@ max_model_len: 32768
max_num_seqs: 128
kv_cache_ratio: 0.75
tensor_parallel_size: 1
enable_static_graph_inference: True
graph_optimization_config:
graph_opt_level: 1

View File

@@ -3,4 +3,5 @@ max_num_seqs: 128
kv_cache_ratio: 0.75
tensor_parallel_size: 1
quantization: wint4
enable_static_graph_inference: True
graph_optimization_config:
graph_opt_level: 1