From 775edcc09ac34a9f64762832cda3b975c6af8d2a Mon Sep 17 00:00:00 2001 From: RAM Date: Tue, 21 Oct 2025 14:25:45 +0800 Subject: [PATCH] [Executor] Default use CUDAGraph (#3594) * add start intercept * Adjustment GraphOptConfig * pre-commit * default use cudagraph * set default value * default use cuda graph * pre-commit * fix test case bug * disable rl * fix moba attention * only support gpu * Temporarily disable PD Disaggregation * set max_num_seqs of test case as 1 * set max_num_seqs and temperature * fix max_num_batched_tokens bug * close cuda graph * success run wint2 * profile run with max_num_batched_tokens * 1.add c++ memchecker 2.success run wint2 * updatee a800 yaml * update docs * 1. delete check 2. fix plas attn test case * default use use_unique_memory_pool * add try-except for warmup * ban mtp, mm, rl * fix test case mock * fix ci bug * fix form_model_get_output_topp0 bug * fix ci bug * refine deepseek ci * refine code * Disable PD * fix sot yaml --- benchmarks/yaml/eb45-32k-wint4-a800-tp4.yaml | 2 +- benchmarks/yaml/eb45-32k-wint8-a800-tp8.yaml | 2 +- benchmarks/yaml/x1-64k-w4a8c8-tp4.yaml | 4 +- custom_ops/gpu_ops/helper.cu | 97 +++++++++++++ custom_ops/gpu_ops/helper.h | 29 ++++ custom_ops/setup_ops.py | 3 +- docs/best_practices/ERNIE-4.5-0.3B-Paddle.md | 7 +- .../ERNIE-4.5-21B-A3B-Paddle.md | 6 +- .../ERNIE-4.5-300B-A47B-Paddle.md | 6 +- docs/features/graph_optimization.md | 11 +- docs/parameters.md | 6 +- .../best_practices/ERNIE-4.5-0.3B-Paddle.md | 7 +- .../ERNIE-4.5-21B-A3B-Paddle.md | 7 +- .../ERNIE-4.5-300B-A47B-Paddle.md | 7 +- docs/zh/features/graph_optimization.md | 13 +- docs/zh/parameters.md | 6 +- fastdeploy/config.py | 55 +++++--- fastdeploy/engine/args_utils.py | 13 +- .../graph_optimization/utils.py | 94 +++++++++++++ fastdeploy/utils.py | 4 +- fastdeploy/worker/gpu_model_runner.py | 130 +++++++++++------- tests/ce/deploy/21b_sot.yaml | 2 +- tests/ci_use/EB_Lite/test_EB_Lite_serving.py | 1 - tests/conftest.py | 5 + 
tests/e2e/test_DeepSeek_V3_5layers_serving.py | 3 +- tests/e2e/test_EB_Lite_serving.py | 3 +- tests/layers/test_plas_attention.py | 2 + tests/model_loader/test_common_model.py | 12 +- tests/model_loader/test_model_cache.py | 8 +- tests/model_loader/test_offline_model.py | 7 +- tests/model_loader/test_torch_model.py | 7 +- tests/model_loader/utils.py | 2 + 32 files changed, 417 insertions(+), 144 deletions(-) diff --git a/benchmarks/yaml/eb45-32k-wint4-a800-tp4.yaml b/benchmarks/yaml/eb45-32k-wint4-a800-tp4.yaml index c609fba49..974c2eaf7 100644 --- a/benchmarks/yaml/eb45-32k-wint4-a800-tp4.yaml +++ b/benchmarks/yaml/eb45-32k-wint4-a800-tp4.yaml @@ -1,6 +1,6 @@ max_model_len: 32768 max_num_seqs: 96 -gpu_memory_utilization: 0.9 +gpu_memory_utilization: 0.85 kv_cache_ratio: 0.71 tensor_parallel_size: 4 quantization: wint4 diff --git a/benchmarks/yaml/eb45-32k-wint8-a800-tp8.yaml b/benchmarks/yaml/eb45-32k-wint8-a800-tp8.yaml index 2a8fea90f..86e08d343 100644 --- a/benchmarks/yaml/eb45-32k-wint8-a800-tp8.yaml +++ b/benchmarks/yaml/eb45-32k-wint8-a800-tp8.yaml @@ -1,6 +1,6 @@ max_model_len: 32768 max_num_seqs: 96 -gpu_memory_utilization: 0.9 +gpu_memory_utilization: 0.85 kv_cache_ratio: 0.71 tensor_parallel_size: 8 quantization: wint8 diff --git a/benchmarks/yaml/x1-64k-w4a8c8-tp4.yaml b/benchmarks/yaml/x1-64k-w4a8c8-tp4.yaml index c77231c8f..30a50170b 100644 --- a/benchmarks/yaml/x1-64k-w4a8c8-tp4.yaml +++ b/benchmarks/yaml/x1-64k-w4a8c8-tp4.yaml @@ -6,5 +6,5 @@ max_num_seqs: 128 enable_prefix_caching: True enable_chunked_prefill: True gpu_memory_utilization: 0.85 -use_cudagraph: True -enable_custom_all_reduce: True +graph_optimization_config: + use_cudagraph: True diff --git a/custom_ops/gpu_ops/helper.cu b/custom_ops/gpu_ops/helper.cu index 5574daee4..bb9dd88da 100644 --- a/custom_ops/gpu_ops/helper.cu +++ b/custom_ops/gpu_ops/helper.cu @@ -13,6 +13,7 @@ // limitations under the License. 
#include "helper.h" +#include float bfloat16_to_float(__nv_bfloat16 x) { uint32_t tmp_x = *(reinterpret_cast(&x)); @@ -47,3 +48,142 @@ static void PrintMatrix(const T* mat_d, outfile << ss.str(); outfile.close(); }
+
+/// Initializes NVML and records which devices this checker monitors.
+///
+/// @throws std::runtime_error if NVML cannot be initialized or the device
+///         count cannot be queried.
+GPUMemoryChecker::GPUMemoryChecker() {
+  nvmlReturn_t result = nvmlInit_v2();
+  if (NVML_SUCCESS != result) {
+    throw std::runtime_error("Failed to initialize NVML: " +
+                             std::string(nvmlErrorString(result)));
+  }
+
+  result = nvmlDeviceGetCount_v2(&deviceCount_);
+  if (NVML_SUCCESS != result) {
+    // A throwing constructor never runs the destructor, so NVML must be
+    // shut down here before the error is reported.
+    nvmlShutdown();
+    throw std::runtime_error("Failed to get GPU count: " +
+                             std::string(nvmlErrorString(result)));
+  }
+
+  getCUDAVisibleDevice();
+}
+
+/// Releases the NVML session opened by the constructor.
+GPUMemoryChecker::~GPUMemoryChecker() {
+  nvmlShutdown();
+}
+
+/// Rebuilds the list of monitored devices and resets their usage baseline.
+///
+/// Without CUDA_VISIBLE_DEVICES every index in [0, deviceCount_) is
+/// monitored. Otherwise the variable is split on ',' and each non-negative
+/// integer entry is monitored; any other entry is reported and skipped.
+///
+/// NOTE(review): the parsed values are later passed to
+/// nvmlDeviceGetHandleByIndex_v2, i.e. they are treated as NVML indices.
+/// Confirm that CUDA and NVML enumerate devices in the same order here.
+void GPUMemoryChecker::getCUDAVisibleDevice() {
+  // The method is public, so start clean: a second call must not append
+  // duplicate entries.
+  visible_device_.clear();
+  visible_device_mem_usage_.clear();
+
+  const char* env_p = std::getenv("CUDA_VISIBLE_DEVICES");
+  if (!env_p) {
+    for (unsigned int i = 0; i < deviceCount_; i++) {
+      visible_device_.push_back(i);
+      // Keep both vectors the same length; addCheckPoint indexes them in
+      // lockstep.
+      visible_device_mem_usage_.push_back(-1);
+    }
+    // Return only after every device is registered, and never fall through
+    // to the parsing code below with a null env_p.
+    return;
+  }
+
+  std::string env_str(env_p);
+  std::istringstream stream(env_str);
+  std::string device_id;
+
+  while (std::getline(stream, device_id, ',')) {
+    int index = -1;
+    try {
+      index = std::stoi(device_id);
+    } catch (const std::exception&) {
+      index = -1;  // empty or non-numeric entry
+    }
+    if (index < 0) {
+      std::cerr << "Skip unsupported CUDA_VISIBLE_DEVICES entry: "
+                << device_id << std::endl;
+      continue;
+    }
+    visible_device_.push_back(index);
+    // -1 marks "no sample yet". NOTE(review): this presumably relies on the
+    // element type being unsigned (or on the unsigned comparison in
+    // addCheckPoint) so that the first check always passes - confirm.
+    visible_device_mem_usage_.push_back(-1);
+  }
+  std::cout << "\nVisible NVIDIA GPU devices" << env_str << std::endl;
+}
+
+/// Samples the used memory of every monitored device.
+///
+/// If MEMCHECKER_CHECK_MEMORY is set, asserts that no device uses more
+/// memory than at the previous checkpoint (the assert is a no-op when
+/// NDEBUG is defined). If MEMCHECKER_PRINT_MEMORY is set, prints the
+/// latest sample of every device in MB. Devices whose NVML query fails
+/// are reported on stderr and keep their previous sample.
+///
+/// @param call_file Source file of the caller (currently not printed).
+/// @param call_line Source line of the caller, printed with the samples.
+void GPUMemoryChecker::addCheckPoint(const char* call_file, int call_line) {
+  try {
+    // Read the switches once per checkpoint rather than once per device.
+    const bool check_memory = std::getenv("MEMCHECKER_CHECK_MEMORY") != nullptr;
+    const bool print_memory = std::getenv("MEMCHECKER_PRINT_MEMORY") != nullptr;
+
+    for (std::size_t i = 0; i < visible_device_.size(); i++) {
+      unsigned int device_id = visible_device_.at(i);
+      nvmlDevice_t device;
+      nvmlReturn_t result = nvmlDeviceGetHandleByIndex_v2(device_id, &device);
+      if (NVML_SUCCESS != result) {
+        std::cerr << "Failed to get handle for GPU " << device_id << ": "
+                  << nvmlErrorString(result) << std::endl;
+        continue;
+      }
+
+      nvmlMemory_t memoryInfo;
+      result = nvmlDeviceGetMemoryInfo(device, &memoryInfo);
+      if (NVML_SUCCESS != result) {
+        std::cerr << "Failed to get memory info for GPU " << device_id << ": "
+                  << nvmlErrorString(result) << std::endl;
+        continue;
+      }
+
+      // Check GPU memory
+      if (check_memory) {
+        assert(memoryInfo.used <= visible_device_mem_usage_.at(i) && "GPU Memory does not allow growth!");
+      }
+      visible_device_mem_usage_[i] = memoryInfo.used;
+    }
+
+    // Print GPU memory
+    if (print_memory) {
+      std::cout << "\nCall Line: " << call_line << "\t";
+      for (std::size_t i = 0; i < visible_device_.size(); i++) {
+        unsigned int device_id = visible_device_.at(i);
+        // Samples are stored by position in visible_device_, not by the
+        // physical device id, so index with i.
+        std::cout << "GPU " << device_id << ": "
+                  << " Used memory: " << visible_device_mem_usage_.at(i) / (1024 * 1024) << " MB\t";
+      }
+      std::cout << std::flush;
+    }
+  } catch (const std::exception& e) {
+    std::cerr << "Error: " << e.what() << std::endl;
+  }
+}
diff --git a/custom_ops/gpu_ops/helper.h b/custom_ops/gpu_ops/helper.h index 6f6554f03..71c9e052a 100644 --- a/custom_ops/gpu_ops/helper.h +++ b/custom_ops/gpu_ops/helper.h @@ -27,6 +27,9 @@ #include #include #include +#include +#include +#include #include #include @@ -618,6 +621,32 @@ inline bool checkAttentionBackend() { return false; } +#ifndef GPU_MEMORY_CHECKER_H +#define GPU_MEMORY_CHECKER_H +class GPUMemoryChecker { +public: + static GPUMemoryChecker* getInstance() { + static GPUMemoryChecker instance; + return &instance; + } + + void addCheckPoint(const char* call_file, int call_line); + unsigned int getGPUCount() const { return deviceCount_; } + void getCUDAVisibleDevice(); + + GPUMemoryChecker(const GPUMemoryChecker&) = delete; + void operator=(const GPUMemoryChecker&) = delete; + +private: + GPUMemoryChecker(); + ~GPUMemoryChecker(); + + unsigned int deviceCount_; + std::vector visible_device_; + std::vector visible_device_mem_usage_; +}; +
+#endif // GPU_MEMORY_CHECKER_H __device__ __forceinline__ float warpReduceMax(float value) { value = fmaxf(value, __shfl_xor_sync(0xffffffff, value, 16)); value = fmaxf(value, __shfl_xor_sync(0xffffffff, value, 8)); diff --git a/custom_ops/setup_ops.py b/custom_ops/setup_ops.py index d1d06e9c2..366ce8243 100644 --- a/custom_ops/setup_ops.py +++ b/custom_ops/setup_ops.py @@ -251,6 +251,7 @@ if paddle.is_compiled_with_rocm(): ) elif paddle.is_compiled_with_cuda(): sources = [ + "gpu_ops/helper.cu", "gpu_ops/save_with_output_msg.cc", "gpu_ops/get_output.cc", "gpu_ops/get_output_msg_with_topk.cc", @@ -499,7 +500,7 @@ elif paddle.is_compiled_with_cuda(): sources=sources, extra_compile_args={"cxx": cc_compile_args, "nvcc": nvcc_compile_args}, libraries=["cublasLt"], - extra_link_args=["-lcuda"], + extra_link_args=["-lcuda", "-lnvidia-ml"], ), packages=find_packages(where="third_party/DeepGEMM"), package_dir={"": "third_party/DeepGEMM"}, diff --git a/docs/best_practices/ERNIE-4.5-0.3B-Paddle.md b/docs/best_practices/ERNIE-4.5-0.3B-Paddle.md index fc7158127..5644f3840 100644 --- a/docs/best_practices/ERNIE-4.5-0.3B-Paddle.md +++ b/docs/best_practices/ERNIE-4.5-0.3B-Paddle.md @@ -74,12 +74,9 @@ For versions 2.1 and earlier, you need to enable it manually by adding CUDAGraph is a GPU computing acceleration technology provided by NVIDIA. It achieves efficient execution and optimization of GPU tasks by capturing CUDA operation sequences into a graph structure. The core idea of CUDAGraph is to encapsulate a series of GPU computing and memory operations into a re-executable graph, thereby reducing CPU-GPU communication overhead, reducing kernel startup latency, and improving overall computing performance. **How to enable:** -Add the following lines to the startup parameters -``` ---use-cudagraph -``` +Before version 2.3, it needs to be enabled through `--use-cudagraph`. +CUDAGraph has been enabled by default in some scenarios at the beginning of version 2.3. 
CUDAGraph will be automatically closed for functions that are not compatible with CUDAGraph (speculative decoding, RL training, multi-mode model). Notes: - - Usually, no additional parameters need to be set, but CUDAGraph will generate some additional memory overhead, which may need to be adjusted in some scenarios with limited memory. For detailed parameter adjustments, please refer to [GraphOptimizationBackend](../features/graph_optimization.md) for related configuration parameter descriptions #### 2.2.5 Rejection Sampling diff --git a/docs/best_practices/ERNIE-4.5-21B-A3B-Paddle.md b/docs/best_practices/ERNIE-4.5-21B-A3B-Paddle.md index 4185a3041..071dbe8e2 100644 --- a/docs/best_practices/ERNIE-4.5-21B-A3B-Paddle.md +++ b/docs/best_practices/ERNIE-4.5-21B-A3B-Paddle.md @@ -90,10 +90,8 @@ Notes: CUDAGraph is a GPU computing acceleration technology provided by NVIDIA. It achieves efficient execution and optimization of GPU tasks by capturing CUDA operation sequences into a graph structure. The core idea of CUDAGraph is to encapsulate a series of GPU computing and memory operations into a re-executable graph, thereby reducing CPU-GPU communication overhead, reducing kernel startup latency, and improving overall computing performance. **How to enable:** -Add the following lines to the startup parameters -``` ---use-cudagraph -``` +Before version 2.3, it needs to be enabled through `--use-cudagraph`. +CUDAGraph has been enabled by default in some scenarios at the beginning of version 2.3. CUDAGraph will be automatically closed for functions that are not compatible with CUDAGraph (speculative decoding, RL training, multi-mode model). Notes: - Usually, no additional parameters need to be set, but CUDAGraph will generate some additional memory overhead, which may need to be adjusted in some scenarios with limited memory. 
For detailed parameter adjustments, please refer to [GraphOptimizationBackend](../features/graph_optimization.md) for related configuration parameter descriptions diff --git a/docs/best_practices/ERNIE-4.5-300B-A47B-Paddle.md b/docs/best_practices/ERNIE-4.5-300B-A47B-Paddle.md index 13668f8b9..f658477e2 100644 --- a/docs/best_practices/ERNIE-4.5-300B-A47B-Paddle.md +++ b/docs/best_practices/ERNIE-4.5-300B-A47B-Paddle.md @@ -141,10 +141,8 @@ python -m fastdeploy.entrypoints.openai.api_server \ CUDAGraph is a GPU computing acceleration technology provided by NVIDIA. It achieves efficient execution and optimization of GPU tasks by capturing CUDA operation sequences into a graph structure. The core idea of CUDAGraph is to encapsulate a series of GPU computing and memory operations into a re-executable graph, thereby reducing CPU-GPU communication overhead, reducing kernel startup latency, and improving overall computing performance. **How to enable:** -Add the following lines to the startup parameters -``` ---use-cudagraph -``` +Before version 2.3, it needs to be enabled through `--use-cudagraph`. +CUDAGraph has been enabled by default in some scenarios at the beginning of version 2.3. CUDAGraph will be automatically closed for functions that are not compatible with CUDAGraph (speculative decoding, RL training, multi-mode model). Notes: - Usually, no additional parameters need to be set, but CUDAGraph will generate some additional memory overhead, which may need to be adjusted in some scenarios with limited memory. 
For detailed parameter adjustments, please refer to [GraphOptimizationBackend](../features/graph_optimization.md) for related configuration parameter descriptions diff --git a/docs/features/graph_optimization.md b/docs/features/graph_optimization.md index 78198333b..ae966f18d 100644 --- a/docs/features/graph_optimization.md +++ b/docs/features/graph_optimization.md @@ -20,7 +20,7 @@ FastDeploy's `GraphOptimizationBackend` design architecture is as follows, **som ## 1. GraphOptimizationBackend Current usage restrictions In the CUDAGraph multi-device inference task, you need to use the Custom all-reduce operator to perform multi-card all-reduce. -Before version 2.2, the CUDAGraph was not enabled by default. the Custom all-reduce operators was enabled by default. +Before version 2.3, CUDAGraph and Custom all reduce were not enabled by default. Since version 2.3, CUDAGraph and Custom all reduce have been enabled by default. ### 1.1 The multi-device scene needs to be enabled Custom all-reduce The `FLAGS_max_partition_size` environment variable controls the `gridDim` execution configuration of Kernel in CascadeAppend Attention, and dynamic execution configuration will cause CUDAGraph execution to fail. @@ -35,13 +35,12 @@ The `FLAGS_max_partition_size` environment variable controls the `gridDim` execu ## 2. GraphOptimizationBackend related configuration parameters Currently, only user configuration of the following parameters is supported: -+ `use_cudagraph` : bool = False -+ `graph_optimization_config` : Dict[str, Any] ++ `graph-optimization-config` : Dict[str, Any] + `graph_opt_level`: int = 0 - + `use_cudagraph`: bool = False - + `cudagraph_capture_sizes` : List[int] = None + + `use_cudagraph`: bool = True + + `cudagraph_capture_sizes` : List[int] -CudaGrpah can be enabled by setting `--use-cudagraph` or `--graph-optimization-config '{"use_cudagraph":true}'`. Using two different methods to set the use graph simultaneously may cause conflicts. 
+Before version 2.3, CUDAGraph had to be enabled with `--use-cudagraph`. Since version 2.3, CUDAGraph is enabled by default in some scenarios and is automatically disabled for features that are not yet compatible with it (speculative decoding, multimodal models). You can also control CUDAGraph manually by setting `--graph-optimization-config`. The `graph_opt_level` parameter within `--graph-optimization-config` is used to configure the graph optimization level, with the following available options: + `0`: Use Dynamic compute graph, default to 0 diff --git a/docs/parameters.md b/docs/parameters.md index e5466e74a..3aa27f1c7 100644 --- a/docs/parameters.md +++ b/docs/parameters.md @@ -37,9 +37,9 @@ When using FastDeploy to deploy models (including offline inference and service | ```long_prefill_token_threshold``` | `int` | When Chunked Prefill is enabled, requests with token count exceeding this value are considered long requests, default: max_model_len*0.04 | | ```static_decode_blocks``` | `int` | During inference, each request is forced to allocate corresponding number of blocks from Prefill's KVCache for Decode use, default: 2 | | ```reasoning_parser``` | `str` | Specify the reasoning parser to extract reasoning content from model output | -| ```use_cudagraph``` | `bool` | Whether to use cuda graph, default False. It is recommended to read [graph_optimization.md](./features/graph_optimization.md) carefully before opening. Custom all-reduce needs to be enabled at the same time in multi-card scenarios. 
| -| ```graph_optimization_config``` | `dict[str]` | Can configure parameters related to calculation graph optimization, the default value is'{"use_cudagraph":false, "graph_opt_level":0, "cudagraph_capture_sizes": null }',Detailed description reference [graph_optimization.md](./features/graph_optimization.md)| -| ```enable_custom_all_reduce``` | `bool` | Enable Custom all-reduce, default: False | +| ```use_cudagraph``` | `bool` | __[DEPRECATED]__ CUDAGraph is enabled by default since version 2.3. It is recommended to read [graph_optimization.md](./features/graph_optimization.md) carefully before changing this setting. | +| ```graph_optimization_config``` | `dict[str]` | Configures parameters related to computation graph optimization; the default value is '{"use_cudagraph":true, "graph_opt_level":0}'. For details, see [graph_optimization.md](./features/graph_optimization.md) | +| ```disable_custom_all_reduce``` | `bool` | Disable Custom all-reduce, default: False | | ```splitwise_role``` | `str` | Whether to enable splitwise inference, default value: mixed, supported parameters: ["mixed", "decode", "prefill"] | | ```innode_prefill_ports``` | `str` | Internal engine startup ports for prefill instances (only required for single-machine PD separation), default: None | | ```guided_decoding_backend``` | `str` | Specify the guided decoding backend to use, supports `auto`, `xgrammar`, `off`, default: `off` | diff --git a/docs/zh/best_practices/ERNIE-4.5-0.3B-Paddle.md b/docs/zh/best_practices/ERNIE-4.5-0.3B-Paddle.md index 5c935cc71..869bf783f 100644 --- a/docs/zh/best_practices/ERNIE-4.5-0.3B-Paddle.md +++ b/docs/zh/best_practices/ERNIE-4.5-0.3B-Paddle.md @@ -75,10 +75,9 @@ python -m fastdeploy.entrypoints.openai.api_server \ CUDAGraph 是 NVIDIA 提供的一项 GPU 计算加速技术,通过将 CUDA 操作序列捕获(capture)为图结构(graph),实现 GPU 任务的高效执行和优化。CUDAGraph 的核心思想是将一系列 GPU 计算和内存操作封装为一个可重复执行的图,从而减少 CPU-GPU 通信开销、降低内核启动延迟,并提升整体计算性能。 **启用方式:** -在启动命令中增加 -``` ---use-cudagraph -``` 
+在2.3版本之前需要通过`--use-cudagraph`启用。 + +2.3版本开始部分场景已默认开启 CUDAGraph,对于暂时不能兼容 CUDAGraph 的功能(投机解码、强化学习训练、多模模型推理)CUDAGraph 会自动关闭。 注: - 通常情况下不需要额外设置其他参数,但CUDAGraph会产生一些额外的显存开销,在一些显存受限的场景下可能需要调整。详细的参数调整请参考[GraphOptimizationBackend](../features/graph_optimization.md) 相关配置参数说明 diff --git a/docs/zh/best_practices/ERNIE-4.5-21B-A3B-Paddle.md b/docs/zh/best_practices/ERNIE-4.5-21B-A3B-Paddle.md index 61f6d27b6..54b528176 100644 --- a/docs/zh/best_practices/ERNIE-4.5-21B-A3B-Paddle.md +++ b/docs/zh/best_practices/ERNIE-4.5-21B-A3B-Paddle.md @@ -92,10 +92,9 @@ python -m fastdeploy.entrypoints.openai.api_server \ CUDAGraph 是 NVIDIA 提供的一项 GPU 计算加速技术,通过将 CUDA 操作序列捕获(capture)为图结构(graph),实现 GPU 任务的高效执行和优化。CUDAGraph 的核心思想是将一系列 GPU 计算和内存操作封装为一个可重复执行的图,从而减少 CPU-GPU 通信开销、降低内核启动延迟,并提升整体计算性能。 **启用方式:** -在启动命令中增加 -``` ---use-cudagraph -``` +在2.3版本之前需要通过`--use-cudagraph`启用。 + +2.3版本开始部分场景已默认开启 CUDAGraph,对于暂时不能兼容 CUDAGraph 的功能(投机解码、强化学习训练、多模模型推理)CUDAGraph 会自动关闭。 注: - 通常情况下不需要额外设置其他参数,但CUDAGraph会产生一些额外的显存开销,在一些显存受限的场景下可能需要调整。详细的参数调整请参考[GraphOptimizationBackend](../features/graph_optimization.md) 相关配置参数说明 diff --git a/docs/zh/best_practices/ERNIE-4.5-300B-A47B-Paddle.md b/docs/zh/best_practices/ERNIE-4.5-300B-A47B-Paddle.md index 824af9e98..6cfe79aa2 100644 --- a/docs/zh/best_practices/ERNIE-4.5-300B-A47B-Paddle.md +++ b/docs/zh/best_practices/ERNIE-4.5-300B-A47B-Paddle.md @@ -142,10 +142,9 @@ python -m fastdeploy.entrypoints.openai.api_server \ CUDAGraph 是 NVIDIA 提供的一项 GPU 计算加速技术,通过将 CUDA 操作序列捕获(capture)为图结构(graph),实现 GPU 任务的高效执行和优化。CUDAGraph 的核心思想是将一系列 GPU 计算和内存操作封装为一个可重复执行的图,从而减少 CPU-GPU 通信开销、降低内核启动延迟,并提升整体计算性能。 **启用方式:** -在启动命令中增加 -``` ---use-cudagraph -``` +在2.3版本之前需要通过`--use-cudagraph`启用。 + +2.3版本开始部分场景已默认开启 CUDAGraph,对于暂时不能兼容 CUDAGraph 的功能(投机解码、强化学习训练、多模模型推理)CUDAGraph 会自动关闭。 注: - 通常情况下不需要额外设置其他参数,但CUDAGraph会产生一些额外的显存开销,在一些显存受限的场景下可能需要调整。详细的参数调整请参考[GraphOptimizationBackend](../features/graph_optimization.md) 相关配置参数说明 diff --git a/docs/zh/features/graph_optimization.md 
b/docs/zh/features/graph_optimization.md index 09e725625..c66dd350a 100644 --- a/docs/zh/features/graph_optimization.md +++ b/docs/zh/features/graph_optimization.md @@ -19,9 +19,9 @@ FastDeploy 的 `GraphOptimizationBackend` 设计架构如下,**部分功能仍 ## 1. GraphOptimizationBackend 当前使用限制 ### 1.1 多卡场景需要开启 Custom all-reduce -在 CUDAGraph 多卡推理任务中需要使用 Custom all-reduce 算子进行多卡 all-reduce, +在 CUDAGraph 多卡推理任务中需要使用 Custom all-reduce 算子进行多卡 all-reduce。 -在 2.2 版本之前,CUDAGraph 未默认开启,Custom all-reduce 算子默认开启。 +2.3 版本之前,CUDAGraph 和 Custom all-reduce 都未默认开启,2.3 版本开始已默认开启。 ### 1.2 FLAGS_max_partition_size 相关的 Kernel 的动态执行配置导致 CUDAGraph 执行失败 `FLAGS_max_partition_size` 环境变量控制了 CascadeAppend Attention 中 Kernel 的`gridDim` 执行配置 , 而动态的执行配置会导致 CUDAGraph 执行失败。 @@ -39,13 +39,12 @@ FastDeploy 的 `GraphOptimizationBackend` 设计架构如下,**部分功能仍 当前仅支持用户配置以下参数: -+ `use_cudagraph` : bool = False -+ `graph_optimization_config` : Dict[str, Any] ++ `graph-optimization-config` : Dict[str, Any] + `graph_opt_level`: int = 0 - + `use_cudagraph`: bool = False - + `cudagraph_capture_sizes` : List[int] = None + + `use_cudagraph`: bool = True + + `cudagraph_capture_sizes` : List[int] -可以通过设置 `--use-cudagraph` 或 `--graph-optimization-config '{"use_cudagraph":true}'` 开启 CudaGrpah。 +在2.3版本之前需要通过`--use-cudagraph`启用。2.3版本开始 CUDAGraph 已经默认开启,对于暂时不能兼容 CUDAGraph 的功能(投机解码、多模模型推理)CUDAGraph 会自动关闭。也可以通过设置 `--graph-optimization-config` 手动控制 CUDAGraph。 `--graph-optimization-config` 中的 `graph_opt_level` 参数用于配置图优化等级,可选项如下: diff --git a/docs/zh/parameters.md b/docs/zh/parameters.md index 9599c04d6..397d31576 100644 --- a/docs/zh/parameters.md +++ b/docs/zh/parameters.md @@ -35,9 +35,9 @@ | ```long_prefill_token_threshold``` | `int` | 开启Chunked Prefill时,请求Token数超过此值的请求被视为长请求,默认为max_model_len*0.04 | | ```static_decode_blocks``` | `int` | 推理过程中,每条请求强制从Prefill的KVCache分配对应块数给Decode使用,默认2| | ```reasoning_parser``` | `str` | 指定要使用的推理解析器,以便从模型输出中提取推理内容 | -| ```use_cudagraph``` | `bool` | 是否使用cuda graph,默认False。开启前建议仔细阅读 
[graph_optimization.md](./features/graph_optimization.md),在多卡场景需要同时开启 Custom all-reduce。 | -| ```graph_optimization_config``` | `dict[str]` | 可以配置计算图优化相关的参数,默认值为'{"use_cudagraph":false, "graph_opt_level":0, "cudagraph_capture_sizes": null }',详细说明参考 [graph_optimization.md](./features/graph_optimization.md)| -| ```enable_custom_all_reduce``` | `bool` | 开启Custom all-reduce,默认False | +| ```use_cudagraph``` | `bool` | __[已废弃]__ 2.3版本开始 CUDAGraph 默认开启,详细说明参考 [graph_optimization.md](./features/graph_optimization.md) | +| ```graph_optimization_config``` | `dict[str]` | 可以配置计算图优化相关的参数,默认值为'{"use_cudagraph":true, "graph_opt_level":0}',详细说明参考 [graph_optimization.md](./features/graph_optimization.md)| +| ```disable_custom_all_reduce``` | `bool` | 关闭Custom all-reduce,默认False | | ```splitwise_role``` | `str` | 是否开启splitwise推理,默认值mixed, 支持参数为["mixed", "decode", "prefill"] | | ```innode_prefill_ports``` | `str` | prefill 实例内部引擎启动端口 (仅单机PD分离需要),默认值None | | ```guided_decoding_backend``` | `str` | 指定要使用的guided decoding后端,支持 `auto`、`xgrammar`、`off`, 默认为 `off` | diff --git a/fastdeploy/config.py b/fastdeploy/config.py index 11f84fbd0..738f563ef 100644 --- a/fastdeploy/config.py +++ b/fastdeploy/config.py @@ -785,7 +785,7 @@ class GraphOptimizationConfig: """ self.sot_warmup_sizes: list[int] = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 16, 32, 64, 128] """ Number of warmup runs for SOT warmup. """ - self.use_cudagraph: bool = False + self.use_cudagraph: bool = True """Sizes to capture cudagraph. - None (default): capture sizes are inferred from llm config. - list[int]: capture sizes are specified as given.""" @@ -821,7 +821,7 @@ class GraphOptimizationConfig: """ Record maps mapped from real shape to captured size to reduce runtime overhead """ self.real_shape_to_captured_size: dict[int, int] = None """ Whether to use shared memory pool for multi capture_size """ - self.use_unique_memory_pool: bool = False + self.use_unique_memory_pool: bool = True # CINN Config ... 
if args is not None: @@ -908,22 +908,6 @@ class GraphOptimizationConfig: len(self.cudagraph_capture_sizes) > 0 ), "In graph optimization config, When opening the CUDA graph, it is forbidden to set the capture sizes to an empty list." - def update_use_cudagraph(self, argument: bool): - """ - Unified user specifies the use_cudagraph parameter through two methods, - '--use-cudagraph' and '--graph-optimization-config' - """ - if self.use_cudagraph is None: - # User only set '--use-cudagraph' - self.use_cudagraph = argument - else: - # User both set '--use-cudagraph' and '--graph-optimization-config' - if self.use_cudagraph is False and argument is True: - raise ValueError( - "Invalid parameter: Cannot set --use-cudagraph and --graph-optimization-config '{\"use_cudagraph\":false}' simultaneously." - ) - argument = self.use_cudagraph - class PlasAttentionConfig: def __init__( @@ -1525,6 +1509,26 @@ class FDConfig: else: self.structured_outputs_config.guided_decoding_backend = "xgrammar" + # Adjustment GraphOptConfig + if ( + (self.speculative_config is not None and self.speculative_config.method is not None) + or (self.model_config is not None and self.model_config.enable_mm is True) + or (self.load_config is not None and self.load_config.dynamic_load_weight is True) + or (self.scheduler_config.splitwise_role != "mixed") + ): + self.graph_opt_config.use_cudagraph = False + logger.info( + "CUDAGraph does not support to be started together with SpeculativeDecode and MultiModel temporarily, but has been automatically closed!" + ) + if self.load_config is not None and self.load_config.dynamic_load_weight is True: + self.graph_opt_config.graph_opt_level = 0 + logger.info( + "Static Graph does not support to be started together with RL Training, and automatically switch to dynamic graph!" 
+ ) + if self.device_config is not None and self.device_config.device_type != "cuda": + self.graph_opt_config.use_cudagraph = False + logger.info(f"CUDAGraph only support on GPU, current device type is {self.device_config.device_type}!") + if self.scheduler_config.splitwise_role == "mixed": self.model_config.moe_phase = MoEPhase(phase="prefill") elif self.scheduler_config.splitwise_role == "prefill": @@ -1628,6 +1632,21 @@ class FDConfig: if self.scheduler_config is not None: self.scheduler_config.check() + # Check graph optimization config + if self.graph_opt_config.use_cudagraph: + if self.speculative_config is not None: + assert ( + self.speculative_config.method is None + ), "CUDAGraph does not support the simultaneous use of Speculative Decoding" + if self.model_config is not None: + assert ( + self.model_config.enable_mm is not True + ), "CUDAGraph cannot be applied to multimodal model temporarily" + if self.graph_opt_config.graph_opt_level > 0 or self.graph_opt_config.use_cudagraph: + if self.load_config is not None: + assert ( + self.load_config.dynamic_load_weight is False + ), "Static graph cannot be used in RL scene temporarily" if int(envs.ENABLE_V1_KVCACHE_SCHEDULER) == 1: assert ( int(envs.FD_DISABLED_RECOVER) == 0 diff --git a/fastdeploy/engine/args_utils.py b/fastdeploy/engine/args_utils.py index bde7d2e9a..176683c98 100644 --- a/fastdeploy/engine/args_utils.py +++ b/fastdeploy/engine/args_utils.py @@ -354,10 +354,6 @@ class EngineArgs: """ SplitWise Use, Results Writer Batch Size """ - use_cudagraph: bool = False - """ - Flags to enable Cuda Graph - """ graph_optimization_config: Optional[Dict[str, Any]] = None """ Configuration for graph optimization backend execution. @@ -586,17 +582,11 @@ class EngineArgs: "is lower than that of the config file. 
" "More complex quantization methods need to be configured via the config file.", ) - model_group.add_argument( - "--use-cudagraph", - action="store_true", - default=EngineArgs.use_cudagraph, - help="Flags to enable cuda graph.", - ) model_group.add_argument( "--graph-optimization-config", type=json.loads, default=EngineArgs.graph_optimization_config, - help="", + help="Configuration for graph optimization", ) model_group.add_argument( "--plas-attention-config", @@ -1057,7 +1047,6 @@ class EngineArgs: parallel_cfg = ParallelConfig(all_dict) scheduler_cfg = self.create_scheduler_config() graph_opt_cfg = self.create_graph_optimization_config() - graph_opt_cfg.update_use_cudagraph(self.use_cudagraph) plas_attention_config = self.create_plas_attention_config() early_stop_cfg = self.create_early_stop_config() diff --git a/fastdeploy/model_executor/graph_optimization/utils.py b/fastdeploy/model_executor/graph_optimization/utils.py index 80e7dc0ed..4aa7729da 100644 --- a/fastdeploy/model_executor/graph_optimization/utils.py +++ b/fastdeploy/model_executor/graph_optimization/utils.py @@ -15,6 +15,100 @@ """ import contextlib +from dataclasses import dataclass + +import paddle +import pynvml + + +@dataclass +class PaddleMemoryInfo: + # Max memory reserved by Paddle + max_reserved: int = 0 + # Max memory allocated by Paddle + max_allocated: int = 0 + # Current memory reserved by Paddle + current_reserved: int = 0 + # Current memory allocated by Paddle + current_allocated: int = 0 + + +class GPUMemoryChecker: + def __init__( + self, + device: int = 0, # logic device id + device_id: int = 0, # physical device id + print_debug_info: bool = True, + ): + self.gpu_memory_info = None + self.paddle_memory_info = None + self.device = device + self.device_id = device_id + self.print_debug_info = print_debug_info + + pynvml.nvmlInit() + self.gpu_memory_handle = pynvml.nvmlDeviceGetHandleByIndex(self.device_id) + + def __del__(self): + """ """ + pynvml.nvmlShutdown() + + def 
_print_memory_info(
+        self,
+        debug_title: str = "",
+    ):
+        """Print the device and Paddle memory statistics stored by the latest checkpoint, headed by `debug_title`."""
+        print(
+            f"\n{debug_title}:",
+            f"\n\tDevice Total memory: {self.gpu_memory_info.total}",
+            f"\n\tDevice Used memory: {self.gpu_memory_info.used}",
+            f"\n\tDevice Free memory: {self.gpu_memory_info.free}",
+            f"\n\tPaddle max memory Reserved: {self.paddle_memory_info.max_reserved}",
+            f"\n\tPaddle max memory Allocated: {self.paddle_memory_info.max_allocated}",
+            f"\n\tPaddle memory Reserved: {self.paddle_memory_info.current_reserved}",
+            f"\n\tPaddle memory Allocated: {self.paddle_memory_info.current_allocated}",
+        )
+
+    def get_gpu_memory_info(self):
+        """Return the memory info that NVML currently reports for `self.gpu_memory_handle`."""
+        current_meminfo = pynvml.nvmlDeviceGetMemoryInfo(self.gpu_memory_handle)
+
+        return current_meminfo
+
+    def get_paddle_memory_info(self) -> PaddleMemoryInfo:
+        """Return a `PaddleMemoryInfo` snapshot of the counters Paddle reports for `self.device`."""
+        current_paddle_memory_info = PaddleMemoryInfo()
+        current_paddle_memory_info.max_reserved = paddle.device.cuda.max_memory_reserved(self.device)
+        current_paddle_memory_info.max_allocated = paddle.device.cuda.max_memory_allocated(self.device)
+        current_paddle_memory_info.current_reserved = paddle.device.cuda.memory_reserved(self.device)
+        current_paddle_memory_info.current_allocated = paddle.device.cuda.memory_allocated(self.device)
+
+        return current_paddle_memory_info
+
+    def _check_memory(self):
+        """Sample memory usage, assert it did not grow since the previous checkpoint, then store the sample."""
+        current_gpu_memory_info = self.get_gpu_memory_info()
+        current_paddle_memory_info = self.get_paddle_memory_info()
+        # The first checkpoint has no previous sample, so it only records a baseline.
+        if self.gpu_memory_info is not None and self.paddle_memory_info is not None:
+            assert (
+                current_paddle_memory_info.max_reserved <= self.paddle_memory_info.max_reserved
+            ), f"Memory Check Failed! Current checkpoint Padddle memory usage ({current_paddle_memory_info.max_reserved}) must be less than or equal to the previous one ({self.paddle_memory_info.max_reserved})."
+            assert (
+                current_gpu_memory_info.used <= self.gpu_memory_info.used
+            ), f"Memory Check Failed! Current checkpoint GPU memory usage ({current_gpu_memory_info.used}) must be less than or equal to the previous one ({self.gpu_memory_info.used})."
+        # The new sample becomes the baseline for the next checkpoint.
+        self.gpu_memory_info = current_gpu_memory_info
+        self.paddle_memory_info = current_paddle_memory_info
+
+    def add_check_point(
+        self,
+        debug_title: str = "",
+    ):
+        """Run the memory check and, when `print_debug_info` is set, print the new sample under `debug_title`."""
+        self._check_memory()
+        if self.print_debug_info:
+            self._print_memory_info(debug_title)
def create_guard(default_value): diff --git a/fastdeploy/utils.py b/fastdeploy/utils.py index f15ff4227..604efd1a1 100644 --- a/fastdeploy/utils.py +++ b/fastdeploy/utils.py @@ -56,7 +56,9 @@ from typing import Callable, Optional # [N,2] -> every line is [config_name, enable_xxx_name] # Make sure enable_xxx equal to config.enable_xxx -ARGS_CORRECTION_LIST = [["early_stop_config", "enable_early_stop"], ["graph_optimization_config", "use_cudagraph"]] +ARGS_CORRECTION_LIST = [ + ["early_stop_config", "enable_early_stop"], +] FASTDEPLOY_SUBCMD_PARSER_EPILOG = ( "Tip: Use `fastdeploy [serve|run-batch|bench ] " diff --git a/fastdeploy/worker/gpu_model_runner.py b/fastdeploy/worker/gpu_model_runner.py index 7fbbb0ab2..e3de08033 100644 --- a/fastdeploy/worker/gpu_model_runner.py +++ b/fastdeploy/worker/gpu_model_runner.py @@ -28,6 +28,7 @@ from paddleformers.utils.log import logger from fastdeploy.config import FDConfig from fastdeploy.engine.request import Request, RequestType from fastdeploy.model_executor.graph_optimization.utils import ( + GPUMemoryChecker, profile_run_guard, sot_warmup_guard, ) @@ -155,6 +156,7 @@ class GPUModelRunner(ModelRunnerBase): self.cudagraph_capture_sizes = list(reversed(self.graph_opt_config.cudagraph_capture_sizes)) self.sot_warmup_sizes = self.graph_opt_config.sot_warmup_sizes self.cudagraph_only_prefill = self.graph_opt_config.cudagraph_only_prefill + self.mem_checker =
GPUMemoryChecker(device_id=self.device_id, print_debug_info=False) # Initialize share inputs self._init_share_inputs(self.scheduler_config.max_num_seqs) @@ -1660,67 +1662,90 @@ class GPUModelRunner(ModelRunnerBase): time_before_capture = time.perf_counter() expected_decode_len = 1 capture_sizes = self.cudagraph_capture_sizes.copy() - if self.fd_config.graph_opt_config.cudagraph_only_prefill: - for num_tokens in sorted(capture_sizes, reverse=True): - self._dummy_run( - num_tokens=num_tokens, - batch_size=self.scheduler_config.max_num_seqs, - in_capturing=True, - expected_decode_len=expected_decode_len, - capture_prefill=True, - ) - logger.info( - f"Warm up the model with the num_tokens:{num_tokens}, expected_decode_len:{expected_decode_len}" - ) - elif self.speculative_decoding and self.speculative_method == "mtp": - # Capture Target Model without bsz 1 - for batch_size in sorted(capture_sizes, reverse=True): - if batch_size == 1: - logger.info("Skip token_num = 1, when capture target model for mtp") - else: - assert batch_size % 2 == 0 + try: + if self.fd_config.graph_opt_config.cudagraph_only_prefill: + for num_tokens in sorted(capture_sizes, reverse=True): self._dummy_run( - num_tokens=self.scheduler_config.max_num_batched_tokens, - batch_size=int(batch_size / 2), + num_tokens=num_tokens, + batch_size=self.scheduler_config.max_num_seqs, in_capturing=True, - expected_decode_len=1, + expected_decode_len=expected_decode_len, + capture_prefill=True, ) - logger.info(f"Warm up the Target model with the num_tokens:{batch_size}, expected_decode_len:{1}") - # Capture Draft Model without bsz 1 - # NOTE(liujundong): expected_decode_len = 1, will affect mtp capture in cudagraph - for batch_size in sorted(capture_sizes, reverse=True): - if batch_size == 1: - logger.info("Skip token_num = 1, when capture Draft model for mtp") - else: - assert batch_size % 2 == 0 + logger.info( + f"Warm up the model with the num_tokens:{num_tokens}, expected_decode_len:{expected_decode_len}" + 
) + elif self.speculative_decoding and self.speculative_method == "mtp": + # Capture Target Model without bsz 1 + for batch_size in sorted(capture_sizes, reverse=True): + if batch_size == 1: + logger.info("Skip token_num = 1, when capture target model for mtp") + else: + assert batch_size % 2 == 0 + self._dummy_run( + num_tokens=self.scheduler_config.max_num_batched_tokens, + batch_size=int(batch_size / 2), + in_capturing=True, + expected_decode_len=1, + ) + logger.info( + f"Warm up the Target model with the num_tokens:{batch_size}, expected_decode_len:{1}" + ) + # Capture Draft Model without bsz 1 + # NOTE(liujundong): expected_decode_len = 1, will affect mtp capture in cudagraph + for batch_size in sorted(capture_sizes, reverse=True): + if batch_size == 1: + logger.info("Skip token_num = 1, when capture Draft model for mtp") + else: + assert batch_size % 2 == 0 + self._dummy_run( + num_tokens=self.scheduler_config.max_num_batched_tokens, + batch_size=int(batch_size / 2), + in_capturing=True, + expected_decode_len=3, + accept_all_drafts=True, + ) + logger.info( + f"Warm up the Draft model with the num_tokens:{batch_size}, expected_decode_len:{3}" + ) + # Capture Draft Model with bsz 1 + if 1 in capture_sizes: self._dummy_run( num_tokens=self.scheduler_config.max_num_batched_tokens, - batch_size=int(batch_size / 2), + batch_size=int(1), in_capturing=True, expected_decode_len=3, - accept_all_drafts=True, + accept_all_drafts=False, ) logger.info(f"Warm up the Draft model with the num_tokens:{batch_size}, expected_decode_len:{3}") - # Capture Draft Model with bsz 1 - if 1 in capture_sizes: - self._dummy_run( - num_tokens=self.scheduler_config.max_num_batched_tokens, - batch_size=int(1), - in_capturing=True, - expected_decode_len=3, - accept_all_drafts=False, - ) - logger.info(f"Warm up the Draft model with the num_tokens:{batch_size}, expected_decode_len:{3}") - else: - for batch_size in sorted(capture_sizes, reverse=True): - self._dummy_run( - 
num_tokens=self.scheduler_config.max_num_batched_tokens, - batch_size=batch_size, - in_capturing=True, - expected_decode_len=expected_decode_len, - ) - logger.info(f"Warm up the model with the batch size:{batch_size}, num tokens:{expected_decode_len}") + else: + for batch_size in sorted(capture_sizes, reverse=True): + self._dummy_run( + num_tokens=self.scheduler_config.max_num_batched_tokens, + batch_size=batch_size, + in_capturing=True, + expected_decode_len=expected_decode_len, + ) + logger.info( + f"Warm up the model with the batch size:{batch_size}, num tokens:{expected_decode_len}" + ) + except RuntimeError as e: + if "out of memory" in str(e): + raise RuntimeError( + "CUDA out of memory occurred when warming up CUDAGraph " + f"with the capture sizes {capture_sizes}. Please try " + "lowering `max_num_seqs` or `gpu_memory_utilization` when " + "initializing the engine." + ) from e + if "CUDA error(700)" in str(e): + raise RuntimeError( + "CUDA error(700), an illegal memory access was encountered, " + "when warming up CUDAGraph. Please try to set the startup parameter: " + "--graph-optimization-config '{\"use_cudagraph\": false}' to close CUDAGraph" + ) from e + else: + raise e time_after_capture = time.perf_counter() logger.info(f"Cuda Graph capturing took {time_after_capture - time_before_capture} seconds") @@ -1818,8 +1843,8 @@ class GPUModelRunner(ModelRunnerBase): self.model_config.max_model_len, ) - logits = None # 4. Compute logits, Sample + logits = None if hasattr(self.model, "is_pooling_model") and self.model.is_pooling_model: # TODO(lizexu123) The execution of the pooling function have not been implemented yet. pass @@ -1937,7 +1962,6 @@ class GPUModelRunner(ModelRunnerBase): # 7. 
Update 'infer_seed' and step_cuda() self.share_inputs["infer_seed"].add_(self.infer_seed_increment) self.share_inputs["infer_seed"][:] %= self.MAX_INFER_SEED - if not envs.ENABLE_V1_KVCACHE_SCHEDULER: step_cuda( self.share_inputs, diff --git a/tests/ce/deploy/21b_sot.yaml b/tests/ce/deploy/21b_sot.yaml index 396e1bd58..243e5335b 100644 --- a/tests/ce/deploy/21b_sot.yaml +++ b/tests/ce/deploy/21b_sot.yaml @@ -2,8 +2,8 @@ max_model_len: 32768 max_num_seqs: 128 tensor_parallel_size: 1 quantization: wint4 -use_cudagraph: True graph_optimization_config: graph_opt_level: 1 sot_warmup_sizes: [2,16,32,64] + use_cudagraph: True full_cuda_graph: False diff --git a/tests/ci_use/EB_Lite/test_EB_Lite_serving.py b/tests/ci_use/EB_Lite/test_EB_Lite_serving.py index 6f9957c25..1e4944390 100644 --- a/tests/ci_use/EB_Lite/test_EB_Lite_serving.py +++ b/tests/ci_use/EB_Lite/test_EB_Lite_serving.py @@ -112,7 +112,6 @@ def setup_and_run_server(): "128", "--quantization", "wint4", - "--use-cudagraph", "--graph-optimization-config", '{"cudagraph_capture_sizes": [1]}', "--guided-decoding-backend", diff --git a/tests/conftest.py b/tests/conftest.py index d6a070d00..4e31414d0 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -23,6 +23,7 @@ class FDRunner: self, model_name_or_path: str, tensor_parallel_size: int = 1, + max_num_seqs: int = 1, max_model_len: int = 1024, load_choices: str = "default", quantization: str = "None", @@ -35,12 +36,16 @@ class FDRunner: ports_to_clean.append(kwargs["engine_worker_queue_port"]) clean_ports(ports_to_clean) time.sleep(5) + graph_optimization_config = {"use_cudagraph": False} self.llm = LLM( model=model_name_or_path, tensor_parallel_size=tensor_parallel_size, + max_num_seqs=max_num_seqs, max_model_len=max_model_len, load_choices=load_choices, quantization=quantization, + max_num_batched_tokens=max_model_len, + graph_optimization_config=graph_optimization_config, **kwargs, ) diff --git a/tests/e2e/test_DeepSeek_V3_5layers_serving.py 
b/tests/e2e/test_DeepSeek_V3_5layers_serving.py index 1c4366a37..d2e734a46 100644 --- a/tests/e2e/test_DeepSeek_V3_5layers_serving.py +++ b/tests/e2e/test_DeepSeek_V3_5layers_serving.py @@ -121,9 +121,8 @@ def setup_and_run_server(): "--quantization", "wint4", "--no-enable-prefix-caching", - "--use-cudagraph", "--graph-optimization-config", - '{"cudagraph_capture_sizes": [1]}', + '{"use_cudagraph":true, "cudagraph_capture_sizes": [1]}', ] # Start subprocess in new process group diff --git a/tests/e2e/test_EB_Lite_serving.py b/tests/e2e/test_EB_Lite_serving.py index 432d07879..4f07e817f 100644 --- a/tests/e2e/test_EB_Lite_serving.py +++ b/tests/e2e/test_EB_Lite_serving.py @@ -119,9 +119,8 @@ def setup_and_run_server(): "128", "--quantization", "wint4", - "--use-cudagraph", "--graph-optimization-config", - '{"cudagraph_capture_sizes": [1]}', + '{"cudagraph_capture_sizes": [1], "use_cudagraph":true}', ] # Start subprocess in new process group diff --git a/tests/layers/test_plas_attention.py b/tests/layers/test_plas_attention.py index 9de05578a..663b27dc9 100644 --- a/tests/layers/test_plas_attention.py +++ b/tests/layers/test_plas_attention.py @@ -354,6 +354,7 @@ class TestPlasAttention(unittest.TestCase): "plas_decoder_top_k_right": 120, } + graph_optimization_config = {"use_cudagraph": False} # 加载模型 llm = LLM( model=model_path, @@ -366,6 +367,7 @@ class TestPlasAttention(unittest.TestCase): enable_chunked_prefill=True, max_num_batched_tokens=8192, plas_attention_config=plas_attention_config, + graph_optimization_config=graph_optimization_config, ) prompts = ["Hello world!"] diff --git a/tests/model_loader/test_common_model.py b/tests/model_loader/test_common_model.py index 38333b8f8..e0d7f636d 100644 --- a/tests/model_loader/test_common_model.py +++ b/tests/model_loader/test_common_model.py @@ -32,24 +32,28 @@ from tests.model_loader.utils import ( FD_ENGINE_QUEUE_PORT = int(os.getenv("FD_ENGINE_QUEUE_PORT", 8313)) FD_CACHE_QUEUE_PORT = 
int(os.getenv("FD_CACHE_QUEUE_PORT", 8333)) -prompts = ["解释下“温故而知新", "Hello, how are you?"] +prompts = ["解释下”温故而知新”", "Hello, how are you?"] model_param_map = { "Qwen3-0.6B": { + "max_num_seqs": 1, "quantizations": ["None", "wint8", "wint4"], }, "ernie-4_5-21b-a3b-bf16-paddle": { + "max_num_seqs": 1, "tensor_parallel_size": 2, "quantizations": [ "wint8", ], }, "Qwen2-7B-Instruct": { + "max_num_seqs": 1, "quantizations": ["wint4"], }, "Qwen3-30B-A3B": { "tensor_parallel_size": 2, + "max_num_seqs": 1, "quantizations": [ { "quant_type": "block_wise_fp8", @@ -91,6 +95,7 @@ for model, cfg in model_param_map.items(): pytest.param( model, cfg.get("tensor_parallel_size", 1), + cfg.get("max_num_seqs", 1), cfg.get("max_model_len", 1024), quant, cfg.get("max_tokens", 32), @@ -102,13 +107,14 @@ for model, cfg in model_param_map.items(): @pytest.mark.parametrize( - "model_name_or_path,tensor_parallel_size,max_model_len,quantization,max_tokens,env", + "model_name_or_path,tensor_parallel_size,max_num_seqs,max_model_len,quantization,max_tokens,env", params, ) def test_common_model( fd_runner, model_name_or_path: str, tensor_parallel_size: int, + max_num_seqs, max_model_len: int, max_tokens: int, quantization: str, @@ -126,6 +132,7 @@ def test_common_model( fd_runner, model_path, tensor_parallel_size, + max_num_seqs, max_model_len, max_tokens, quantization, @@ -141,6 +148,7 @@ def test_common_model( fd_runner, model_path, tensor_parallel_size, + max_num_seqs, max_model_len, max_tokens, quantization, diff --git a/tests/model_loader/test_model_cache.py b/tests/model_loader/test_model_cache.py index e48a136b6..f347b26b7 100644 --- a/tests/model_loader/test_model_cache.py +++ b/tests/model_loader/test_model_cache.py @@ -44,6 +44,8 @@ model_param_map = { "env": {"FD_ENABLE_MODEL_LOAD_CACHE": "1"}, } ], + "max_num_seqs": 1, + "graph_optimization_config": {"use_cudagraph": False}, } } @@ -59,6 +61,7 @@ for model, cfg in model_param_map.items(): pytest.param( model, 
cfg.get("tensor_parallel_size", 1), + cfg.get("max_num_seqs", 1), cfg.get("max_model_len", 1024), quant, cfg.get("max_tokens", 32), @@ -70,13 +73,14 @@ for model, cfg in model_param_map.items(): @pytest.mark.parametrize( - "model_name_or_path,tensor_parallel_size,max_model_len,quantization,max_tokens,env", + "model_name_or_path,tensor_parallel_size,max_num_seqs,max_model_len,quantization,max_tokens,env", params, ) def test_model_cache( fd_runner, model_name_or_path: str, tensor_parallel_size: int, + max_num_seqs: int, max_model_len: int, max_tokens: int, quantization: str, @@ -91,6 +95,7 @@ def test_model_cache( fd_runner, model_path, tensor_parallel_size, + max_num_seqs, max_model_len, max_tokens, quantization, @@ -111,6 +116,7 @@ def test_model_cache( fd_runner, model_path, tensor_parallel_size, + max_num_seqs, max_model_len, max_tokens, quantization, diff --git a/tests/model_loader/test_offline_model.py b/tests/model_loader/test_offline_model.py index 9ab408588..25373da43 100644 --- a/tests/model_loader/test_offline_model.py +++ b/tests/model_loader/test_offline_model.py @@ -44,6 +44,8 @@ model_param_map = { "env": {"DG_NVCC_OVERRIDE_CPP_STANDARD": "17"}, }, ], + "max_num_seqs": 1, + "graph_optimization_config": {"use_cudagraph": False}, }, } @@ -58,6 +60,7 @@ for model, cfg in model_param_map.items(): pytest.param( model, cfg.get("tensor_parallel_size", 1), + cfg.get("max_num_seqs", 1), cfg.get("max_model_len", 1024), quant, cfg.get("max_tokens", 32), @@ -69,13 +72,14 @@ for model, cfg in model_param_map.items(): @pytest.mark.parametrize( - "model_name_or_path,tensor_parallel_size,max_model_len,quantization,max_tokens,env", + "model_name_or_path,tensor_parallel_size,max_num_seqs,max_model_len,quantization,max_tokens,env", params, ) def test_offline_model( fd_runner, model_name_or_path: str, tensor_parallel_size: int, + max_num_seqs: int, max_model_len: int, max_tokens: int, quantization: str, @@ -93,6 +97,7 @@ def test_offline_model( fd_runner, 
torch_model_path, tensor_parallel_size, + max_num_seqs, max_model_len, max_tokens, quantization, diff --git a/tests/model_loader/test_torch_model.py b/tests/model_loader/test_torch_model.py index ca9bcb883..841391b7a 100644 --- a/tests/model_loader/test_torch_model.py +++ b/tests/model_loader/test_torch_model.py @@ -75,10 +75,12 @@ def check_result_against_baseline(outputs, baseline_file, threshold=0.05): hugging_face_model_param_map = { "Qwen2.5-7B-Instruct": { + "max_num_seqs": 1, "tensor_parallel_size": 2, "quantizations": ["wint8"], }, "Qwen3-30B-A3B": { + "max_num_seqs": 1, "tensor_parallel_size": 2, "quantizations": ["wint8"], }, @@ -91,6 +93,7 @@ for model, cfg in hugging_face_model_param_map.items(): pytest.param( model, cfg.get("tensor_parallel_size", 2), + cfg.get("max_num_seqs", 1), cfg.get("max_model_len", 1024), q, cfg.get("max_tokens", 100), @@ -100,13 +103,14 @@ for model, cfg in hugging_face_model_param_map.items(): @pytest.mark.parametrize( - "model_name_or_path,tensor_parallel_size,max_model_len,quantization,max_tokens", + "model_name_or_path,tensor_parallel_size,max_num_seqs,max_model_len,quantization,max_tokens", hf_params, ) def test_model_against_baseline( fd_runner, model_name_or_path: str, tensor_parallel_size: int, + max_num_seqs: int, max_model_len: int, max_tokens: int, quantization: str, @@ -123,6 +127,7 @@ def test_model_against_baseline( fd_runner, torch_model_path, tensor_parallel_size, + max_num_seqs, max_model_len, max_tokens, quantization, diff --git a/tests/model_loader/utils.py b/tests/model_loader/utils.py index 3c405c3e7..302274193 100644 --- a/tests/model_loader/utils.py +++ b/tests/model_loader/utils.py @@ -78,6 +78,7 @@ def form_model_get_output_topp0( fd_runner, model_path, tensor_parallel_size, + max_num_seqs, max_model_len, max_tokens, quantization, @@ -91,6 +92,7 @@ def form_model_get_output_topp0( with fd_runner( model_path, tensor_parallel_size=tensor_parallel_size, + max_num_seqs=max_num_seqs, 
max_model_len=max_model_len, load_choices=load_choices, quantization=quantization,