Mirror of https://github.com/PaddlePaddle/FastDeploy.git (synced 2025-10-04 16:22:57 +08:00)

Commit: "Sync v2.0 version of code to github repo"
.clang-format (new file, 29 lines)
@@ -0,0 +1,29 @@
# This file is used by clang-format to autoformat Paddle source code.
#
# clang-format is part of the LLVM toolchain.
# LLVM and clang need to be installed to format source code with it.
#
# The basic usage is:
#   clang-format -i -style=file PATH/TO/SOURCE/CODE
#
# -style=file implicitly uses the ".clang-format" file located in one of the
# parent directories.
# -i means in-place change.
#
# The clang-format documentation is at:
#   http://clang.llvm.org/docs/ClangFormat.html
#   http://clang.llvm.org/docs/ClangFormatStyleOptions.html
---
Language: Cpp
BasedOnStyle: Google
IndentWidth: 4
TabWidth: 2
ContinuationIndentWidth: 4
AccessModifierOffset: -1 # private/protected/public get no extra indent inside a class
Standard: Cpp11
AllowAllParametersOfDeclarationOnNextLine: true
BinPackParameters: false
BinPackArguments: false
IncludeBlocks: Preserve
IncludeIsMainSourceRegex: (\.cu)$
...
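A small sketch (not part of the commit) of the invocation the comments above describe, driven from Python; the target file path is a placeholder.

```python
import subprocess

# Format one source file in place; -style=file picks up the .clang-format
# found by walking parent directories, as noted in the comments above.
subprocess.run(
    ["clang-format", "-i", "-style=file", "custom_ops/gpu_ops/example.cu"],
    check=True,
)
```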
.gitignore (vendored, 6 lines changed)
@@ -121,7 +121,7 @@ dmypy.json
 FETCH_HEAD

 #log
-log/
+log*/

 checkpoints/
 checkpoints_origin/
@@ -158,3 +158,7 @@ custom_ops/gpu_ops/fp8_deep_gemm/deep_gemm/include/cute

 # buff
 custom_ops/tmp*
+
+build
+
+.ccls-cache
.pre-commit-config.yaml (inferred from content; file header not captured)
@@ -16,7 +16,7 @@ repos:
     rev: v0.11.7
     hooks:
       - id: ruff
-        args: [--output-format, github, --fix]
+        args: [--output-format, github, --fix, --line-length=120]
   # # spell check
   # - repo: https://github.com/codespell-project/codespell
   #   rev: v2.4.1
@@ -29,14 +29,15 @@ repos:
     rev: 6.0.1
     hooks:
       - id: isort
-  # formatting
-  - repo: https://github.com/pre-commit/mirrors-clang-format
-    rev: v20.1.3
-    hooks:
-      - id: clang-format
-        # exclude: '.*'
-        types_or: [c++, cuda]
-        args: [--style=file, --verbose]
+  # # formatting
+  # - repo: https://github.com/pre-commit/mirrors-clang-format
+  #   rev: v20.1.3
+  #   hooks:
+  #     - id: clang-format
+  #       # exclude: '.*'
+  #       types_or: [c++, cuda]
+  #       args: [--style=file, --verbose]

   # markdown
   - repo: https://github.com/jackdewinter/pymarkdown
     rev: v0.9.29
README.md (156 lines changed)
Previous README.md (removed; translated from Chinese):

# FastDeploy 2.0: Large Model Inference and Deployment

<p align="center">
    <a href="./LICENSE"><img src="https://img.shields.io/badge/license-Apache%202-dfd.svg"></a>
    <a href="https://github.com/PaddlePaddle/FastDeploy/releases"><img src="https://img.shields.io/github/v/release/PaddlePaddle/FastDeploy?color=ffa"></a>
    <a href=""><img src="https://img.shields.io/badge/python-3.10+-aff.svg"></a>
    <a href=""><img src="https://img.shields.io/badge/os-linux-pink.svg"></a>
    <a href="https://github.com/PaddlePaddle/FastDeploy/graphs/contributors"><img src="https://img.shields.io/github/contributors/PaddlePaddle/FastDeploy?color=9ea"></a>
    <a href="https://github.com/PaddlePaddle/FastDeploy/commits"><img src="https://img.shields.io/github/commit-activity/m/PaddlePaddle/FastDeploy?color=3af"></a>
    <a href="https://github.com/PaddlePaddle/FastDeploy/stargazers"><img src="https://img.shields.io/github/stars/PaddlePaddle/FastDeploy?color=ccf"></a>
</p>

FastDeploy 2.0 supports inference for multiple large models (currently only Qwen2; support for more models is coming soon). Its inference and deployment features cover:

- One-command service deployment of a model, with streaming generation
- Tensor-parallel acceleration of model inference
- PagedAttention and continuous batching (dynamic batching)
- OpenAI-compatible HTTP protocol
- Weight-only int8/int4 lossless compression
- Prometheus metrics

> Note: If you are still using FastDeploy to deploy small models (e.g. PaddleClas/PaddleOCR and other CV suite models), please check out the [release/1.1.0 branch](https://github.com/PaddlePaddle/FastDeploy/tree/release/1.1.0).

## Requirements

- A800/H800/H100
- Python >= 3.10
- CUDA >= 12.3
- CUDNN >= 9.5
- Linux x64

## Installation

### Docker (recommended)
```
docker pull ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/fastdeploy:2.0.0.0-alpha
```

### From source

#### Install PaddlePaddle

> Install a nightly build newer than 2025-05-30; see the [PaddlePaddle installation guide](https://www.paddlepaddle.org.cn/en/install/quick?docurl=/documentation/docs/en/develop/install/pip/linux-pip_en.html) and choose the CUDA 12.6 develop (nightly build) version.
```
python -m pip install --pre paddlepaddle-gpu -i https://www.paddlepaddle.org.cn/packages/nightly/cu126/
```

#### Build and install FastDeploy

```
# Build
cd FastDeploy
bash build.sh
# Install
pip install dist/fastdeploy-2.0.0a0-py3-none-any.whl
```

## Quick start

After installation, run the following commands to quickly deploy the Qwen2 model; see the [parameter reference](docs/serving.md) for more options and their meaning.

``` shell
# Download and extract the Qwen model
wget https://fastdeploy.bj.bcebos.com/llm/models/Qwen2-7B-Instruct.tar.gz && tar xvf Qwen2-7B-Instruct.tar.gz
# Deploy on a single GPU
python -m fastdeploy.entrypoints.openai.api_server --model ./Qwen2-7B-Instruct --port 8188 --tensor-parallel-size 1
```

Send a request to the service with:
``` shell
curl -X POST "http://0.0.0.0:8188/v1/chat/completions" \
  -H "Content-Type: application/json" \
  -d '{
    "messages": [
      {"role": "user", "content": "Hello, what is your name?"}
    ]
  }'
```
The response looks like this:
``` json
{
  "id": "chatcmpl-db662f47-7c8c-4945-9a7a-db563b2ddd8d",
  "object": "chat.completion",
  "created": 1749451045,
  "model": "default",
  "choices": [
    {
      "index": 0,
      "message": {
        "role": "assistant",
        "content": "Hello! My name is Qwen (Tongyi Qianwen).",
        "reasoning_content": null
      },
      "finish_reason": "stop"
    }
  ],
  "usage": {
    "prompt_tokens": 25,
    "total_tokens": 35,
    "completion_tokens": 10,
    "prompt_tokens_details": null
  }
}
```
FastDeploy provides a fully OpenAI-compatible service API (the `model` and `api_key` fields are not yet supported and are ignored if set), so the service can also be called through the openai Python API.

## Deployment docs
- [Offline inference](docs/offline_inference.md)
- [Serving](docs/serving.md)
- [Service metrics](docs/metrics.md)

# Code guide
- [Code directory guide](docs/code_guide.md)
- Suggestions and issues about using FastDeploy are welcome via GitHub issues.

# License
FastDeploy follows the [Apache-2.0 open-source license](./LICENSE). To align with the [vLLM](https://github.com/vllm-project/vllm) interface, parts of the vLLM code were referenced and used directly, for which we are grateful.

New README.md (added):

<p align="center">
  <a href="https://github.com/PaddlePaddle/FastDeploy/releases"><img src="https://github.com/user-attachments/assets/42b0039f-39e3-4279-afda-6d1865dfbffb" width="500"></a>
</p>
<p align="center">
    <a href=""><img src="https://img.shields.io/badge/python-3.10-aff.svg"></a>
    <a href=""><img src="https://img.shields.io/badge/os-linux-pink.svg"></a>
    <a href="https://github.com/PaddlePaddle/FastDeploy/graphs/contributors"><img src="https://img.shields.io/github/contributors/PaddlePaddle/FastDeploy?color=9ea"></a>
    <a href="https://github.com/PaddlePaddle/FastDeploy/commits"><img src="https://img.shields.io/github/commit-activity/m/PaddlePaddle/FastDeploy?color=3af"></a>
    <a href="https://github.com/PaddlePaddle/FastDeploy/stargazers"><img src="https://img.shields.io/github/stars/PaddlePaddle/FastDeploy?color=ccf"></a>
</p>

<p align="center">
    <a href="docs/get_started/installation/README.md"><b> Installation </b></a>
    |
    <a href="docs/get_started.md"><b> Quick Start </b></a>
    |
    <a href="docs/supported_models.md"><b> Supported Models </b></a>
</p>

--------------------------------------------------------------------------------
# FastDeploy 2.0: Inference and Deployment Toolkit for LLMs and VLMs based on PaddlePaddle

## News

**[2025-06] 🔥 Released FastDeploy v2.0:** Supports inference and deployment for ERNIE 4.5. Furthermore, we open-source an industrial-grade PD disaggregation with context caching and dynamic role switching for effective resource utilization, further enhancing inference performance for MoE models.

## About

**FastDeploy** is an inference and deployment toolkit for large language models and visual language models based on PaddlePaddle. It delivers **production-ready, out-of-the-box deployment solutions** with core acceleration technologies:

- 🚀 **Load-Balanced PD Disaggregation**: Industrial-grade solution featuring context caching and dynamic instance role switching. Optimizes resource utilization while balancing SLO compliance and throughput.
- 🔄 **Unified KV Cache Transmission**: Lightweight high-performance transport library with intelligent NVLink/RDMA selection.
- 🤝 **OpenAI API Server and vLLM Compatible**: One-command deployment with [vLLM](https://github.com/vllm-project/vllm/) interface compatibility.
- 🧮 **Comprehensive Quantization Format Support**: W8A16, W8A8, W4A16, W4A8, W2A16, FP8, and more.
- ⏩ **Advanced Acceleration Techniques**: Speculative decoding, Multi-Token Prediction (MTP) and Chunked Prefill.
- 🖥️ **Multi-Hardware Support**: NVIDIA GPU, Kunlunxin XPU, Hygon DCU, Ascend NPU, Iluvatar GPU, Enflame GCU, MetaX GPU, etc.

## Requirements

- OS: Linux
- Python: 3.10 ~ 3.12

## Installation

FastDeploy supports inference deployment on **NVIDIA GPUs**, **Kunlunxin XPUs**, **Iluvatar GPUs**, **Enflame GCUs**, and other hardware. For detailed installation instructions:

- [NVIDIA GPU](./docs/installation/nvidia_cuda.md)
- [Kunlunxin XPU](./docs/en/get_started/installation/kunlunxin_xpu.md)
- [Iluvatar GPU](./docs/en/get_started/installation/iluvatar_gpu.md)
- [Enflame GCU](./docs/en/get_started/installation/Enflame_gcu.md)

**Note:** We are actively working on expanding hardware support. Additional hardware platforms including Ascend NPU, Hygon DCU, and MetaX GPU are currently under development and testing. Stay tuned for updates!

## Get Started

Learn how to use FastDeploy through our documentation:
- [10-Minute Quick Deployment](./docs/get_started/quick_start.md)
- [ERNIE-4.5 Large Language Model Deployment](./docs/get_started/ernie-4.5.md)
- [ERNIE-4.5-VL Multimodal Model Deployment](./docs/get_started/ernie-4.5-vl.md)
- [Offline Inference Development](./docs/offline_inference.md)
- [Online Service Deployment](./docs/serving/README.md)
- [Full Supported Models List](./docs/supported_models.md)

## Supported Models

| Model | Data Type | PD Disaggregation | Chunked Prefill | Prefix Caching | MTP | CUDA Graph | Maximum Context Length |
|:--- | :------- | :---------- | :-------- | :-------- | :----- | :----- | :----- |
| ERNIE-4.5-300B-A47B | BF16/WINT4/WINT8/W4A8C8/WINT2/FP8 | ✅ (WINT4/W4A8C8/Expert Parallelism) | ✅ | ✅ | ✅ (WINT4) | WIP | 128K |
| ERNIE-4.5-300B-A47B-Base | BF16/WINT4/WINT8 | ✅ (WINT4/Expert Parallelism) | ✅ | ✅ | ✅ (WINT4) | ❌ | 128K |
| ERNIE-4.5-VL-424B-A47B | BF16/WINT4/WINT8 | WIP | ✅ | WIP | ❌ | WIP | 128K |
| ERNIE-4.5-VL-28B-A3B | BF16/WINT4/WINT8 | ❌ | ✅ | WIP | ❌ | WIP | 128K |
| ERNIE-4.5-21B-A3B | BF16/WINT4/WINT8/FP8 | ❌ | ✅ | ✅ | WIP | ✅ | 128K |
| ERNIE-4.5-21B-A3B-Base | BF16/WINT4/WINT8/FP8 | ❌ | ✅ | ✅ | WIP | ✅ | 128K |
| ERNIE-4.5-0.3B | BF16/WINT8/FP8 | ❌ | ✅ | ✅ | ❌ | ✅ | 128K |

## Advanced Usage

- [Quantization](./docs/quantization/README.md)
- [PD Disaggregation Deployment](./docs/features/pd_disaggregation.md)
- [Speculative Decoding](./docs/features/speculative_decoding.md)
- [Prefix Caching](./docs/features/prefix_caching.md)
- [Chunked Prefill](./docs/features/chunked_prefill.md)

## Acknowledgement

FastDeploy is licensed under the [Apache-2.0 open-source license](./LICENSE). During development, portions of [vLLM](https://github.com/vllm-project/vllm) code were referenced and incorporated to maintain interface compatibility, for which we express our gratitude.
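A minimal sketch (not part of the commit) of calling the OpenAI-compatible server from Python with the `openai` client, as the previous README suggested; it assumes the quick-start server above is listening on port 8188, and the `model`/`api_key` values are placeholders that the server ignores.

```python
from openai import OpenAI

# Point the official OpenAI client at the local FastDeploy server.
client = OpenAI(base_url="http://0.0.0.0:8188/v1", api_key="unused")

response = client.chat.completions.create(
    model="default",
    messages=[{"role": "user", "content": "Hello, what is your name?"}],
)
print(response.choices[0].message.content)
```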
benchmarks/README.md (new file, 106 lines)
@@ -0,0 +1,106 @@
### FastDeploy serving benchmark (load-testing) tool

#### Dataset

Download it locally with wget for performance testing.

<table style="width:100%; border-collapse: collapse;">
<thead>
<tr>
<th style="width:15%; text-align: left;">Dataset</th>
<th style="width:65%; text-align: left;">Data Path</th>
</tr>
</thead>
<tbody>
<tr>
<td><strong>Open-source dataset, 2k samples</strong></td>
<td><code>https://fastdeploy.bj.bcebos.com/eb_query/filtered_sharedgpt_2000_input_1136_output_200_fd.json</code></td>
</tr>
</tbody>
</table>

#### Usage

```
# Install dependencies
python -m pip install -r requirements.txt
```

##### Parameters

```bash
--backend openai-chat: backend interface used for the benchmark; "openai-chat" uses the chat/completions endpoint
--model EB45T: model name; any name works and only affects the name of the saved result file
--endpoint /v1/chat/completions: endpoint, used to build the request URL
--host 0.0.0.0: service IP address, used to build the request URL
--port 9812: service HTTP port, used to build the request URL
--dataset-name EBChat: dataset class; "EBChat" reads a dataset converted to the FD format
--dataset-path ./eb45t_spv4_dataserver_1w_waigua_fd: path to the benchmark dataset
--hyperparameter-path EB45T.yaml: (optional) hyperparameter file; its contents are merged into the request payload, no hyperparameters by default
--percentile-metrics ttft,tpot,itl,e2el,s_ttft,s_itl,s_e2el,s_decode,input_len,s_input_len,output_len: set of metrics reported in the results
--metric-percentiles 80,95,99,99.9,99.95,99.99: percentiles reported for those metrics
--num-prompts 1: total number of requests to send
--max-concurrency 1: benchmark concurrency
--save-result: enable result saving; results are written to a JSON file
```

##### Single-request debugging against /v1/chat/completions

```
python benchmark_serving.py \
  --backend openai-chat \
  --model EB45T \
  --endpoint /v1/chat/completions \
  --host 0.0.0.0 \
  --port 9812 \
  --dataset-name EBChat \
  --dataset-path ./filtered_sharedgpt_2000_input_1136_output_200_fd.json \
  --hyperparameter-path yaml/request_yaml/eb45t-32k.yaml \
  --percentile-metrics ttft,tpot,itl,e2el,s_ttft,s_itl,s_e2el,s_decode,input_len,s_input_len,output_len \
  --metric-percentiles 80,95,99,99.9,99.95,99.99 \
  --num-prompts 1 \
  --max-concurrency 1 \
  --save-result
```

##### Full benchmark against /v1/chat/completions: 100-way concurrency, 2000 requests

```
# Save output to infer_log.txt
python benchmark_serving.py \
  --backend openai-chat \
  --model EB45T \
  --endpoint /v1/chat/completions \
  --host 0.0.0.0 \
  --port 9812 \
  --dataset-name EBChat \
  --dataset-path ./filtered_sharedgpt_2000_input_1136_output_200_fd.json \
  --hyperparameter-path yaml/request_yaml/eb45t-32k.yaml \
  --percentile-metrics ttft,tpot,itl,e2el,s_ttft,s_itl,s_e2el,s_decode,input_len,s_input_len,output_len \
  --metric-percentiles 80,95,99,99.9,99.95,99.99 \
  --num-prompts 2000 \
  --max-concurrency 100 \
  --save-result > infer_log.txt 2>&1 &
```

##### Benchmarking /v1/completions

Set the endpoint to /v1/completions and the backend to openai to benchmark the /v1/completions endpoint.

```
# Save output to infer_log.txt
python benchmark_serving.py \
  --backend openai \
  --model EB45T \
  --endpoint /v1/completions \
  --host 0.0.0.0 \
  --port 9812 \
  --dataset-name EBChat \
  --dataset-path ./filtered_sharedgpt_2000_input_1136_output_200_fd.json \
  --hyperparameter-path yaml/request_yaml/eb45t-32k.yaml \
  --percentile-metrics ttft,tpot,itl,e2el,s_ttft,s_itl,s_e2el,s_decode,input_len,s_input_len,output_len \
  --metric-percentiles 80,95,99,99.9,99.95,99.99 \
  --num-prompts 2000 \
  --max-concurrency 100 \
  --save-result > infer_log.txt 2>&1 &
```
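A short sketch (not part of the repository) of what --hyperparameter-path does: the YAML contents are merged into every request payload, mirroring `payload.update(request_func_input.hyper_parameters)` in backend_request_func.py. The keys mentioned in the comment are assumptions for illustration.

```python
import yaml

with open("yaml/request_yaml/eb45t-32k.yaml") as f:
    hyper_parameters = yaml.safe_load(f) or {}

payload = {
    "model": "default",
    "messages": [{"role": "user", "content": "hello"}],
    "stream": True,
}
# Merge sampling hyperparameters (e.g. temperature, top_p, max_tokens) into the request.
payload.update(hyper_parameters)
```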
benchmarks/backend_request_func.py (new file, 700 lines)
@@ -0,0 +1,700 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""

# This file is modified from https://github.com/vllm-project/vllm/blob/main/benchmarks/backend_request_func.py


import io
import json
import os
import sys
import time
import traceback
from dataclasses import dataclass, field
from typing import Optional

import aiohttp
from tqdm.asyncio import tqdm

AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=6 * 60 * 60)


@dataclass
class RequestFuncInput:
    """Input for requesting LLMs via API"""
    prompt: str
    history_QA: Optional[dict]
    hyper_parameters: dict
    api_url: str
    prompt_len: int
    output_len: int
    model: str
    model_name: Optional[str] = None
    logprobs: Optional[int] = None
    extra_body: Optional[dict] = None
    multi_modal_content: Optional[dict] = None
    ignore_eos: bool = False
    language: Optional[str] = None


@dataclass
class RequestFuncOutput:
    """Output for requesting LLMs via API"""
    generated_text: str = ""
    reasoning_content: str = ""
    success: bool = False
    latency: float = 0.0
    output_tokens: int = 0
    ttft: float = 0.0  # Time to first token
    arrival_time: list = field(default_factory=list)  # per-chunk arrival times
    itl: list = field(default_factory=list)  # list of inter-token latencies
    tpot: float = 0.0  # avg next-token latency
    prompt_len: int = 0
    prompt_tokens: int = 0  # number of input tokens reported by the inference side
    error: str = ""

async def async_request_eb_openai_chat_completions(
    request_func_input: RequestFuncInput,
    pbar: Optional[tqdm] = None,
) -> RequestFuncOutput:
    """Request an LLM using the EB OpenAI chat completions API"""
    api_url = request_func_input.api_url
    assert api_url.endswith(
        ("completions", "profile")
    ), "OpenAI Chat Completions API URL must end with 'completions'."

    async with aiohttp.ClientSession(trust_env=True,
                                     timeout=AIOHTTP_TIMEOUT) as session:
        content = [{"type": "text", "text": request_func_input.prompt}]
        if request_func_input.multi_modal_content:
            content.append(request_func_input.multi_modal_content)
        payload = {
            "model": "default",
            "messages": request_func_input.history_QA,
            "stream": True,
            "stream_options": {
                "include_usage": True,
                "continuous_usage_stats": True
            },
        }
        # Hyperparameters are passed in from the YAML file
        payload.update(request_func_input.hyper_parameters)

        if request_func_input.ignore_eos:
            payload["ignore_eos"] = request_func_input.ignore_eos
        headers = {
            "Content-Type": "application/json",
            "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}",
        }

        output = RequestFuncOutput()
        output.prompt_len = 0

        ttft = 0.0
        st = time.perf_counter()
        most_recent_timestamp = st
        try:
            async with session.post(url=api_url, json=payload,
                                    headers=headers) as response:
                if response.status == 200:
                    async for chunk_bytes in response.content:
                        chunk_bytes = chunk_bytes.strip()
                        if not chunk_bytes:
                            continue

                        chunk = chunk_bytes.decode("utf-8").removeprefix(
                            "data: ")
                        if chunk != "[DONE]":
                            # print("####chunk:", chunk, type(chunk))
                            timestamp = time.perf_counter()
                            data = json.loads(chunk)

                            if choices := data.get("choices"):
                                content = choices[0]["delta"].get("content")
                                reason_content = choices[0]["delta"].get(
                                    "reasoning_content")
                                # First token
                                if ttft == 0.0:
                                    ttft = timestamp - st
                                    output.ttft = ttft
                                    # cached_tokens
                                    output.prompt_len = data["usage"]["prompt_tokens_details"]["cached_tokens"]

                                # Decoding phase
                                else:
                                    output.itl.append(timestamp -
                                                      most_recent_timestamp)

                                output.generated_text += content or ""
                                output.reasoning_content += reason_content or ""
                                output.arrival_time.append(choices[0].get("arrival_time"))
                            elif usage := data.get("usage"):
                                output.output_tokens = usage.get(
                                    "completion_tokens")
                                output.prompt_tokens = usage.get(
                                    "prompt_tokens")

                            most_recent_timestamp = timestamp

                    # output.generated_text = generated_text
                    if output.generated_text.strip() == "":
                        output.success = False
                        output.error = "No generated text found!"
                    else:
                        output.success = True
                        output.latency = most_recent_timestamp - st
                else:
                    error_text = await response.text()
                    print("####error response:", error_text,
                          "####payload:", payload)
                    output.error = error_text or ""
                    output.success = False
        except Exception:
            output.success = False
            exc_info = sys.exc_info()
            output.error = "".join(traceback.format_exception(*exc_info))

    # Save the results of failed requests
    if not output.success:
        with open("error_output.txt", "a") as f:
            f.write(str(output) + "\n")
    if pbar:
        pbar.update(1)
    return output

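# --- Illustrative helper (not part of the original file): how the timing fields
# populated above relate to the commonly reported TPOT metric. The function name
# is hypothetical and shown only as an example.
def tpot_from_output(out: RequestFuncOutput) -> float:
    """Average per-token decode latency, excluding the first token."""
    decode_tokens = max((out.output_tokens or 1) - 1, 1)
    return (out.latency - out.ttft) / decode_tokens
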
async def async_request_eb_openai_completions(
|
||||||
|
request_func_input: RequestFuncInput,
|
||||||
|
pbar: Optional[tqdm] = None,
|
||||||
|
) -> RequestFuncOutput:
|
||||||
|
"""Request an LLM using EB OpenAI"""
|
||||||
|
api_url = request_func_input.api_url
|
||||||
|
assert api_url.endswith(
|
||||||
|
("completions", "profile")
|
||||||
|
), "OpenAI Completions API URL must end with 'completions' or 'profile'."
|
||||||
|
|
||||||
|
async with aiohttp.ClientSession(trust_env=True,
|
||||||
|
timeout=AIOHTTP_TIMEOUT) as session:
|
||||||
|
payload = {
|
||||||
|
"model": "default",
|
||||||
|
"prompt": request_func_input.prompt,
|
||||||
|
"stream": True,
|
||||||
|
"stream_options": {
|
||||||
|
"include_usage": True,
|
||||||
|
"continuous_usage_stats": True
|
||||||
|
},
|
||||||
|
}
|
||||||
|
# 超参由yaml传入
|
||||||
|
payload.update(request_func_input.hyper_parameters)
|
||||||
|
|
||||||
|
if request_func_input.ignore_eos:
|
||||||
|
payload["ignore_eos"] = request_func_input.ignore_eos
|
||||||
|
headers = {
|
||||||
|
"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"
|
||||||
|
}
|
||||||
|
|
||||||
|
output = RequestFuncOutput()
|
||||||
|
output.prompt_len = request_func_input.prompt_len
|
||||||
|
|
||||||
|
generated_text = ""
|
||||||
|
st = time.perf_counter()
|
||||||
|
most_recent_timestamp = st
|
||||||
|
try:
|
||||||
|
async with session.post(url=api_url, json=payload,
|
||||||
|
headers=headers) as response:
|
||||||
|
if response.status == 200:
|
||||||
|
first_chunk_received = False
|
||||||
|
async for chunk_bytes in response.content:
|
||||||
|
chunk_bytes = chunk_bytes.strip()
|
||||||
|
if not chunk_bytes:
|
||||||
|
continue
|
||||||
|
|
||||||
|
chunk = chunk_bytes.decode("utf-8").removeprefix(
|
||||||
|
"data: ")
|
||||||
|
if chunk != "[DONE]":
|
||||||
|
# print("####chunk:", chunk, chunk.usage)
|
||||||
|
data = json.loads(chunk)
|
||||||
|
|
||||||
|
# NOTE: Some completion API might have a last
|
||||||
|
# usage summary response without a token so we
|
||||||
|
# want to check a token was generated
|
||||||
|
if choices := data.get("choices"):
|
||||||
|
# Note that text could be empty here
|
||||||
|
# e.g. for special tokens
|
||||||
|
text = choices[0].get("text")
|
||||||
|
timestamp = time.perf_counter()
|
||||||
|
# First token
|
||||||
|
if not first_chunk_received:
|
||||||
|
first_chunk_received = True
|
||||||
|
ttft = time.perf_counter() - st
|
||||||
|
output.ttft = ttft
|
||||||
|
|
||||||
|
# Decoding phase
|
||||||
|
else:
|
||||||
|
output.itl.append(timestamp -
|
||||||
|
most_recent_timestamp)
|
||||||
|
|
||||||
|
most_recent_timestamp = timestamp
|
||||||
|
output.arrival_time.append(choices[0].get("arrival_time"))
|
||||||
|
generated_text += text or ""
|
||||||
|
elif usage := data.get("usage"):
|
||||||
|
output.prompt_tokens = usage.get(
|
||||||
|
"prompt_tokens")
|
||||||
|
output.output_tokens = usage.get(
|
||||||
|
"completion_tokens")
|
||||||
|
if first_chunk_received:
|
||||||
|
output.success = True
|
||||||
|
else:
|
||||||
|
output.success = False
|
||||||
|
output.error = (
|
||||||
|
"Never received a valid chunk to calculate TTFT."
|
||||||
|
"This response will be marked as failed!")
|
||||||
|
output.generated_text = generated_text
|
||||||
|
output.latency = most_recent_timestamp - st
|
||||||
|
else:
|
||||||
|
output.error = response.reason or ""
|
||||||
|
output.success = False
|
||||||
|
except Exception:
|
||||||
|
output.success = False
|
||||||
|
exc_info = sys.exc_info()
|
||||||
|
output.error = "".join(traceback.format_exception(*exc_info))
|
||||||
|
|
||||||
|
if pbar:
|
||||||
|
pbar.update(1)
|
||||||
|
return output
|
||||||
|
|
||||||
|
|
||||||
|
async def async_request_tgi(
|
||||||
|
request_func_input: RequestFuncInput,
|
||||||
|
pbar: Optional[tqdm] = None,
|
||||||
|
) -> RequestFuncOutput:
|
||||||
|
"""Request an LLM using the TGI API"""
|
||||||
|
api_url = request_func_input.api_url
|
||||||
|
assert api_url.endswith("generate_stream")
|
||||||
|
|
||||||
|
async with aiohttp.ClientSession(trust_env=True,
|
||||||
|
timeout=AIOHTTP_TIMEOUT) as session:
|
||||||
|
params = {
|
||||||
|
"max_new_tokens": request_func_input.output_len,
|
||||||
|
"do_sample": True,
|
||||||
|
"temperature": 0.01, # TGI does not accept 0.0 temperature.
|
||||||
|
"top_p": 0.99, # TGI does not accept 1.0 top_p.
|
||||||
|
"truncate": request_func_input.prompt_len,
|
||||||
|
"ignore_eos_token": request_func_input.ignore_eos,
|
||||||
|
}
|
||||||
|
payload = {
|
||||||
|
"inputs": request_func_input.prompt,
|
||||||
|
"parameters": params,
|
||||||
|
}
|
||||||
|
output = RequestFuncOutput()
|
||||||
|
output.prompt_len = request_func_input.prompt_len
|
||||||
|
if request_func_input.ignore_eos:
|
||||||
|
output.output_tokens = request_func_input.output_len
|
||||||
|
else:
|
||||||
|
output.output_tokens = None
|
||||||
|
|
||||||
|
ttft = 0.0
|
||||||
|
st = time.perf_counter()
|
||||||
|
most_recent_timestamp = st
|
||||||
|
try:
|
||||||
|
async with session.post(url=api_url, json=payload) as response:
|
||||||
|
if response.status == 200:
|
||||||
|
async for chunk_bytes in response.content:
|
||||||
|
chunk_bytes = chunk_bytes.strip()
|
||||||
|
if not chunk_bytes:
|
||||||
|
continue
|
||||||
|
chunk_bytes = chunk_bytes.decode("utf-8")
|
||||||
|
|
||||||
|
# NOTE: Sometimes TGI returns a ping response without
|
||||||
|
# any data, we should skip it.
|
||||||
|
if chunk_bytes.startswith(":"):
|
||||||
|
continue
|
||||||
|
chunk = chunk_bytes.removeprefix("data:")
|
||||||
|
|
||||||
|
data = json.loads(chunk)
|
||||||
|
timestamp = time.perf_counter()
|
||||||
|
# First token
|
||||||
|
if ttft == 0.0:
|
||||||
|
ttft = time.perf_counter() - st
|
||||||
|
output.ttft = ttft
|
||||||
|
|
||||||
|
# Decoding phase
|
||||||
|
else:
|
||||||
|
output.itl.append(timestamp -
|
||||||
|
most_recent_timestamp)
|
||||||
|
|
||||||
|
most_recent_timestamp = timestamp
|
||||||
|
output.arrival_time.append(data["arrival_time"])
|
||||||
|
|
||||||
|
output.latency = most_recent_timestamp - st
|
||||||
|
output.success = True
|
||||||
|
output.generated_text = data["generated_text"]
|
||||||
|
else:
|
||||||
|
output.error = response.reason or ""
|
||||||
|
output.success = False
|
||||||
|
except Exception:
|
||||||
|
output.success = False
|
||||||
|
exc_info = sys.exc_info()
|
||||||
|
output.error = "".join(traceback.format_exception(*exc_info))
|
||||||
|
|
||||||
|
if pbar:
|
||||||
|
pbar.update(1)
|
||||||
|
return output
|
||||||
|
|
||||||
|
|
||||||
|
async def async_request_trt_llm(
|
||||||
|
request_func_input: RequestFuncInput,
|
||||||
|
pbar: Optional[tqdm] = None,
|
||||||
|
) -> RequestFuncOutput:
|
||||||
|
"""Request an LLM using TRT's llm_server"""
|
||||||
|
api_url = request_func_input.api_url
|
||||||
|
assert api_url.endswith("generate_stream")
|
||||||
|
|
||||||
|
async with aiohttp.ClientSession(trust_env=True,
|
||||||
|
timeout=AIOHTTP_TIMEOUT) as session:
|
||||||
|
payload = {
|
||||||
|
"accumulate_tokens": True,
|
||||||
|
"text_input": request_func_input.prompt,
|
||||||
|
"temperature": 0.0,
|
||||||
|
"top_p": 1.0,
|
||||||
|
"max_tokens": request_func_input.output_len,
|
||||||
|
"stream": True,
|
||||||
|
}
|
||||||
|
if request_func_input.ignore_eos:
|
||||||
|
payload["min_length"] = request_func_input.output_len
|
||||||
|
output = RequestFuncOutput()
|
||||||
|
output.prompt_len = request_func_input.prompt_len
|
||||||
|
|
||||||
|
ttft = 0.0
|
||||||
|
st = time.perf_counter()
|
||||||
|
most_recent_timestamp = st
|
||||||
|
try:
|
||||||
|
async with session.post(url=api_url, json=payload) as response:
|
||||||
|
if response.status == 200:
|
||||||
|
async for chunk_bytes in response.content:
|
||||||
|
chunk_bytes = chunk_bytes.strip()
|
||||||
|
if not chunk_bytes:
|
||||||
|
continue
|
||||||
|
|
||||||
|
chunk = chunk_bytes.decode("utf-8").removeprefix(
|
||||||
|
"data:")
|
||||||
|
|
||||||
|
data = json.loads(chunk)
|
||||||
|
output.generated_text += data["text_output"]
|
||||||
|
timestamp = time.perf_counter()
|
||||||
|
# First token
|
||||||
|
if ttft == 0.0:
|
||||||
|
ttft = timestamp - st
|
||||||
|
output.ttft = ttft
|
||||||
|
|
||||||
|
# Decoding phase
|
||||||
|
else:
|
||||||
|
output.itl.append(timestamp -
|
||||||
|
most_recent_timestamp)
|
||||||
|
|
||||||
|
most_recent_timestamp = timestamp
|
||||||
|
|
||||||
|
output.latency = most_recent_timestamp - st
|
||||||
|
output.success = True
|
||||||
|
|
||||||
|
else:
|
||||||
|
output.error = response.reason or ""
|
||||||
|
output.success = False
|
||||||
|
except Exception:
|
||||||
|
output.success = False
|
||||||
|
exc_info = sys.exc_info()
|
||||||
|
output.error = "".join(traceback.format_exception(*exc_info))
|
||||||
|
|
||||||
|
if pbar:
|
||||||
|
pbar.update(1)
|
||||||
|
return output
|
||||||
|
|
||||||
|
|
||||||
|
async def async_request_deepspeed_mii(
|
||||||
|
request_func_input: RequestFuncInput,
|
||||||
|
pbar: Optional[tqdm] = None,
|
||||||
|
) -> RequestFuncOutput:
|
||||||
|
"""Request an LLM using Deepspeed MII"""
|
||||||
|
async with aiohttp.ClientSession(trust_env=True,
|
||||||
|
timeout=AIOHTTP_TIMEOUT) as session:
|
||||||
|
|
||||||
|
payload = {
|
||||||
|
"prompt": request_func_input.prompt,
|
||||||
|
"max_tokens": request_func_input.output_len,
|
||||||
|
"temperature": 0.01, # deepspeed-mii does not accept 0.0 temp.
|
||||||
|
"top_p": 1.0,
|
||||||
|
}
|
||||||
|
output = RequestFuncOutput()
|
||||||
|
output.prompt_len = request_func_input.prompt_len
|
||||||
|
|
||||||
|
# NOTE: DeepSpeed-MII doesn't support streaming as of Jan 28 2024,
|
||||||
|
# will use 0 as placeholder.
|
||||||
|
# See https://github.com/microsoft/DeepSpeed-MII/pull/311
|
||||||
|
output.ttft = 0
|
||||||
|
|
||||||
|
st = time.perf_counter()
|
||||||
|
try:
|
||||||
|
async with session.post(url=request_func_input.api_url,
|
||||||
|
json=payload) as response:
|
||||||
|
if response.status == 200:
|
||||||
|
parsed_resp = await response.json()
|
||||||
|
output.latency = time.perf_counter() - st
|
||||||
|
if "choices" in parsed_resp:
|
||||||
|
output.generated_text = parsed_resp["choices"][0][
|
||||||
|
"text"]
|
||||||
|
elif "text" in parsed_resp:
|
||||||
|
output.generated_text = parsed_resp["text"][0]
|
||||||
|
else:
|
||||||
|
output.error = ("Unexpected response format: "
|
||||||
|
"neither 'choices' nor 'text' found")
|
||||||
|
output.success = False
|
||||||
|
output.success = True
|
||||||
|
else:
|
||||||
|
output.error = response.reason or ""
|
||||||
|
output.success = False
|
||||||
|
except Exception:
|
||||||
|
output.success = False
|
||||||
|
exc_info = sys.exc_info()
|
||||||
|
output.error = "".join(traceback.format_exception(*exc_info))
|
||||||
|
|
||||||
|
if pbar:
|
||||||
|
pbar.update(1)
|
||||||
|
return output
|
||||||
|
|
||||||
|
|
||||||
|
async def async_request_openai_completions(
|
||||||
|
request_func_input: RequestFuncInput,
|
||||||
|
pbar: Optional[tqdm] = None,
|
||||||
|
) -> RequestFuncOutput:
|
||||||
|
"""Request an LLM using OpenAI"""
|
||||||
|
api_url = request_func_input.api_url
|
||||||
|
assert api_url.endswith(
|
||||||
|
("completions", "profile")
|
||||||
|
), "OpenAI Completions API URL must end with 'completions' or 'profile'."
|
||||||
|
|
||||||
|
async with aiohttp.ClientSession(trust_env=True,
|
||||||
|
timeout=AIOHTTP_TIMEOUT) as session:
|
||||||
|
payload = {
|
||||||
|
"model": request_func_input.model_name \
|
||||||
|
if request_func_input.model_name else request_func_input.model,
|
||||||
|
"prompt": request_func_input.prompt,
|
||||||
|
# "temperature": 0.0,
|
||||||
|
"max_tokens": request_func_input.output_len,
|
||||||
|
"logprobs": request_func_input.logprobs,
|
||||||
|
"stream": True,
|
||||||
|
#"stream_options": {
|
||||||
|
# "include_usage": True,
|
||||||
|
#},
|
||||||
|
}
|
||||||
|
if request_func_input.ignore_eos:
|
||||||
|
payload["ignore_eos"] = request_func_input.ignore_eos
|
||||||
|
|
||||||
|
headers = {
|
||||||
|
"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"
|
||||||
|
}
|
||||||
|
|
||||||
|
output = RequestFuncOutput()
|
||||||
|
output.prompt_len = request_func_input.prompt_len
|
||||||
|
|
||||||
|
generated_text = ""
|
||||||
|
st = time.perf_counter()
|
||||||
|
most_recent_timestamp = st
|
||||||
|
try:
|
||||||
|
async with session.post(url=api_url, json=payload,
|
||||||
|
headers=headers) as response:
|
||||||
|
if response.status == 200:
|
||||||
|
first_chunk_received = False
|
||||||
|
async for chunk_bytes in response.content:
|
||||||
|
chunk_bytes = chunk_bytes.strip()
|
||||||
|
if not chunk_bytes:
|
||||||
|
continue
|
||||||
|
|
||||||
|
chunk = chunk_bytes.decode("utf-8").removeprefix(
|
||||||
|
"data: ")
|
||||||
|
if chunk != "[DONE]":
|
||||||
|
# print("####chunk:", chunk, type(chunk))
|
||||||
|
data = json.loads(chunk)
|
||||||
|
|
||||||
|
# NOTE: Some completion API might have a last
|
||||||
|
# usage summary response without a token so we
|
||||||
|
# want to check a token was generated
|
||||||
|
if choices := data.get("choices"):
|
||||||
|
# Note that text could be empty here
|
||||||
|
# e.g. for special tokens
|
||||||
|
text = choices[0].get("text")
|
||||||
|
timestamp = time.perf_counter()
|
||||||
|
# First token
|
||||||
|
if not first_chunk_received:
|
||||||
|
first_chunk_received = True
|
||||||
|
ttft = time.perf_counter() - st
|
||||||
|
output.ttft = ttft
|
||||||
|
|
||||||
|
# Decoding phase
|
||||||
|
else:
|
||||||
|
output.itl.append(timestamp -
|
||||||
|
most_recent_timestamp)
|
||||||
|
|
||||||
|
most_recent_timestamp = timestamp
|
||||||
|
generated_text += text or ""
|
||||||
|
elif usage := data.get("usage"):
|
||||||
|
output.output_tokens = usage.get(
|
||||||
|
"completion_tokens")
|
||||||
|
if first_chunk_received:
|
||||||
|
output.success = True
|
||||||
|
else:
|
||||||
|
output.success = False
|
||||||
|
output.error = (
|
||||||
|
"Never received a valid chunk to calculate TTFT."
|
||||||
|
"This response will be marked as failed!")
|
||||||
|
output.generated_text = generated_text
|
||||||
|
output.latency = most_recent_timestamp - st
|
||||||
|
else:
|
||||||
|
output.error = response.reason or ""
|
||||||
|
output.success = False
|
||||||
|
except Exception:
|
||||||
|
output.success = False
|
||||||
|
exc_info = sys.exc_info()
|
||||||
|
output.error = "".join(traceback.format_exception(*exc_info))
|
||||||
|
|
||||||
|
if pbar:
|
||||||
|
pbar.update(1)
|
||||||
|
return output
|
||||||
|
|
||||||
|
|
||||||
|
async def async_request_openai_audio(
|
||||||
|
request_func_input: RequestFuncInput,
|
||||||
|
pbar: Optional[tqdm] = None,
|
||||||
|
) -> RequestFuncOutput:
|
||||||
|
"""Request an LLM using OpenAI"""
|
||||||
|
# Lazy import without PlaceholderModule to avoid vllm dep.
|
||||||
|
import soundfile
|
||||||
|
api_url = request_func_input.api_url
|
||||||
|
assert api_url.endswith(
|
||||||
|
("transcriptions", "translations"
|
||||||
|
)), "OpenAI Chat Completions API URL must end with 'transcriptions' "
|
||||||
|
"or `translations`."
|
||||||
|
|
||||||
|
async with aiohttp.ClientSession(trust_env=True,
|
||||||
|
timeout=AIOHTTP_TIMEOUT) as session:
|
||||||
|
content = [{"type": "text", "text": request_func_input.prompt}]
|
||||||
|
payload = {
|
||||||
|
"model": request_func_input.model_name \
|
||||||
|
if request_func_input.model_name else request_func_input.model,
|
||||||
|
"temperature": 0.0,
|
||||||
|
"max_completion_tokens": request_func_input.output_len,
|
||||||
|
"stream": True,
|
||||||
|
"language": "en",
|
||||||
|
# Flattened due to multipart/form-data
|
||||||
|
"stream_include_usage": True,
|
||||||
|
"stream_continuous_usage_stats": True
|
||||||
|
}
|
||||||
|
if request_func_input.extra_body:
|
||||||
|
payload.update(request_func_input.extra_body)
|
||||||
|
headers = {
|
||||||
|
"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}",
|
||||||
|
}
|
||||||
|
|
||||||
|
# Send audio file
|
||||||
|
def to_bytes(y, sr):
|
||||||
|
buffer = io.BytesIO()
|
||||||
|
soundfile.write(buffer, y, sr, format="WAV")
|
||||||
|
buffer.seek(0)
|
||||||
|
return buffer
|
||||||
|
|
||||||
|
with to_bytes(*request_func_input.multi_modal_content['audio']) as f:
|
||||||
|
form = aiohttp.FormData()
|
||||||
|
form.add_field('file', f, content_type='audio/wav')
|
||||||
|
for key, value in payload.items():
|
||||||
|
form.add_field(key, str(value))
|
||||||
|
|
||||||
|
output = RequestFuncOutput()
|
||||||
|
output.prompt_len = request_func_input.prompt_len
|
||||||
|
|
||||||
|
generated_text = ""
|
||||||
|
ttft = 0.0
|
||||||
|
st = time.perf_counter()
|
||||||
|
most_recent_timestamp = st
|
||||||
|
try:
|
||||||
|
async with session.post(url=api_url,
|
||||||
|
data=form,
|
||||||
|
headers=headers) as response:
|
||||||
|
if response.status == 200:
|
||||||
|
async for chunk_bytes in response.content:
|
||||||
|
chunk_bytes = chunk_bytes.strip()
|
||||||
|
if not chunk_bytes:
|
||||||
|
continue
|
||||||
|
|
||||||
|
chunk = chunk_bytes.decode("utf-8").removeprefix(
|
||||||
|
"data: ")
|
||||||
|
if chunk != "[DONE]":
|
||||||
|
timestamp = time.perf_counter()
|
||||||
|
data = json.loads(chunk)
|
||||||
|
|
||||||
|
if choices := data.get("choices"):
|
||||||
|
content = choices[0]["delta"].get(
|
||||||
|
"content")
|
||||||
|
# First token
|
||||||
|
if ttft == 0.0:
|
||||||
|
ttft = timestamp - st
|
||||||
|
output.ttft = ttft
|
||||||
|
|
||||||
|
# Decoding phase
|
||||||
|
else:
|
||||||
|
output.itl.append(
|
||||||
|
timestamp - most_recent_timestamp)
|
||||||
|
|
||||||
|
generated_text += content or ""
|
||||||
|
elif usage := data.get("usage"):
|
||||||
|
output.output_tokens = usage.get(
|
||||||
|
"completion_tokens")
|
||||||
|
|
||||||
|
most_recent_timestamp = timestamp
|
||||||
|
|
||||||
|
output.generated_text = generated_text
|
||||||
|
output.success = True
|
||||||
|
output.latency = most_recent_timestamp - st
|
||||||
|
else:
|
||||||
|
output.error = response.reason or ""
|
||||||
|
output.success = False
|
||||||
|
except Exception:
|
||||||
|
output.success = False
|
||||||
|
exc_info = sys.exc_info()
|
||||||
|
output.error = "".join(traceback.format_exception(*exc_info))
|
||||||
|
|
||||||
|
if pbar:
|
||||||
|
pbar.update(1)
|
||||||
|
return output

ASYNC_REQUEST_FUNCS = {
    "tgi": async_request_tgi,
    "vllm": async_request_openai_completions,
    "lmdeploy": async_request_openai_completions,
    "deepspeed-mii": async_request_deepspeed_mii,
    "openai": async_request_eb_openai_completions,
    "openai-chat": async_request_eb_openai_chat_completions,
    "openai-audio": async_request_openai_audio,
    "tensorrt-llm": async_request_trt_llm,
    "scalellm": async_request_openai_completions,
    "sglang": async_request_openai_completions,
}

OPENAI_COMPATIBLE_BACKENDS = [
    k for k, v in ASYNC_REQUEST_FUNCS.items()
    if v in (async_request_openai_completions,
             async_request_eb_openai_chat_completions)
]
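A minimal usage sketch (not part of the file): dispatching one streaming request through the backend table above with asyncio. The URL, port, and prompt are placeholders.

```python
import asyncio

async def main():
    # Look up the request coroutine for the "openai-chat" backend and await it.
    request_func = ASYNC_REQUEST_FUNCS["openai-chat"]
    result = await request_func(
        RequestFuncInput(
            prompt="hello",
            history_QA=[{"role": "user", "content": "hello"}],
            hyper_parameters={},
            api_url="http://0.0.0.0:9812/v1/chat/completions",
            prompt_len=0,
            output_len=128,
            model="default",
        )
    )
    print(result.success, result.ttft, result.generated_text[:80])

asyncio.run(main())
```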
benchmarks/benchmark_dataset.py (new file, 309 lines)
@@ -0,0 +1,309 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""

# This file is modified from https://github.com/vllm-project/vllm/blob/main/benchmarks/benchmark_dataset.py


import base64
import io
import json
import logging
import random
from abc import ABC, abstractmethod
from collections.abc import Mapping
from dataclasses import dataclass
from io import BytesIO
from typing import Any, Callable, Optional, Union

from PIL import Image

logger = logging.getLogger(__name__)


@dataclass
class SampleRequest:
    """
    Represents a single inference request for benchmarking.
    """

    prompt: Union[str, Any]
    history_QA: Union[str, Any]
    json_data: Optional[dict]
    prompt_len: int
    expected_output_len: int


class BenchmarkDataset(ABC):
|
||||||
|
"""BenchmarkDataset"""
|
||||||
|
DEFAULT_SEED = 0
|
||||||
|
IS_MULTIMODAL = False
|
||||||
|
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
dataset_path: Optional[str] = None,
|
||||||
|
random_seed: int = DEFAULT_SEED,
|
||||||
|
hyperparameter_path: Optional[str] = None,
|
||||||
|
) -> None:
|
||||||
|
"""
|
||||||
|
Initialize the BenchmarkDataset with an optional dataset path and random
|
||||||
|
seed. Args:
|
||||||
|
dataset_path (Optional[str]): Path to the dataset. If None, it
|
||||||
|
indicates that a default or random dataset might be used.
|
||||||
|
random_seed (int): Seed value for reproducible shuffling or
|
||||||
|
sampling. Defaults to DEFAULT_SEED.
|
||||||
|
"""
|
||||||
|
self.dataset_path = dataset_path
|
||||||
|
# Set the random seed, ensuring that a None value is replaced with the
|
||||||
|
# default seed.
|
||||||
|
self.random_seed = (random_seed
|
||||||
|
if random_seed is not None else self.DEFAULT_SEED)
|
||||||
|
self.data = None
|
||||||
|
self.hyperparameter_path = hyperparameter_path
|
||||||
|
self.hyperparameters = {}
|
||||||
|
|
||||||
|
def load_data(self) -> None:
|
||||||
|
"""
|
||||||
|
Load data from the dataset path into self.data.
|
||||||
|
|
||||||
|
This method must be overridden by subclasses since the method to load
|
||||||
|
data will vary depending on the dataset format and source.
|
||||||
|
|
||||||
|
Raises:
|
||||||
|
NotImplementedError: If a subclass does not implement this method.
|
||||||
|
"""
|
||||||
|
# TODO (jenniferzhao): add support for downloading data
|
||||||
|
raise NotImplementedError(
|
||||||
|
"load_data must be implemented in subclasses.")
|
||||||
|
|
||||||
|
@abstractmethod
|
||||||
|
def sample(self, num_requests: int) -> list[SampleRequest]:
|
||||||
|
"""
|
||||||
|
Abstract method to generate sample requests from the dataset.
|
||||||
|
|
||||||
|
Subclasses must override this method to implement dataset-specific logic
|
||||||
|
for generating a list of SampleRequest objects.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
num_requests (int): The number of sample requests to generate.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
list[SampleRequest]: A list of sample requests generated from the
|
||||||
|
dataset.
|
||||||
|
"""
|
||||||
|
raise NotImplementedError("sample must be implemented in subclasses.")
|
||||||
|
|
||||||
|
def maybe_oversample_requests(self, requests: list[SampleRequest],
|
||||||
|
num_requests: int) -> None:
|
||||||
|
"""
|
||||||
|
Oversamples the list of requests if its size is less than the desired
|
||||||
|
number.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
requests (List[SampleRequest]): The current list of sampled
|
||||||
|
requests. num_requests (int): The target number of requests.
|
||||||
|
"""
|
||||||
|
if len(requests) < num_requests:
|
||||||
|
random.seed(self.random_seed)
|
||||||
|
additional = random.choices(requests,
|
||||||
|
k=num_requests - len(requests))
|
||||||
|
requests.extend(additional)
|
||||||
|
logger.info("Oversampled requests to reach %d total samples.",
|
||||||
|
num_requests)
|
||||||
|
|
||||||
|
|
||||||
|
def is_valid_sequence(
|
||||||
|
prompt_len: int,
|
||||||
|
output_len: int,
|
||||||
|
min_len: int = 4,
|
||||||
|
max_prompt_len: int = 1024,
|
||||||
|
max_total_len: int = 2048,
|
||||||
|
skip_min_output_len_check: bool = False,
|
||||||
|
) -> bool:
|
||||||
|
"""
|
||||||
|
Validate a sequence based on prompt and output lengths.
|
||||||
|
|
||||||
|
Default pruning criteria are copied from the original `sample_hf_requests`
|
||||||
|
and `sample_sharegpt_requests` functions in benchmark_serving.py, as well as
|
||||||
|
from `sample_requests` in benchmark_throughput.py.
|
||||||
|
"""
|
||||||
|
# Check for invalid conditions
|
||||||
|
prompt_too_short = prompt_len < min_len
|
||||||
|
output_too_short = (not skip_min_output_len_check) and (output_len
|
||||||
|
< min_len)
|
||||||
|
prompt_too_long = prompt_len > max_prompt_len
|
||||||
|
combined_too_long = (prompt_len + output_len) > max_total_len
|
||||||
|
|
||||||
|
# Return True if none of the invalid conditions are met
|
||||||
|
return not (prompt_too_short or output_too_short or prompt_too_long
|
||||||
|
or combined_too_long)
|
||||||
|
|
||||||
|
|
||||||
|
def process_image(image: Any) -> Mapping[str, Any]:
|
||||||
|
"""
|
||||||
|
Process a single image input and return a multimedia content dictionary.
|
||||||
|
|
||||||
|
Supports three input types:
|
||||||
|
|
||||||
|
1. Dictionary with raw image bytes: - Expects a dict with a 'bytes' key
|
||||||
|
containing raw image data. - Loads the bytes as a PIL.Image.Image.
|
||||||
|
|
||||||
|
2. PIL.Image.Image input: - Converts the image to RGB. - Saves the image as
|
||||||
|
a JPEG in memory. - Encodes the JPEG data as a base64 string. - Returns
|
||||||
|
a dictionary with the image as a base64 data URL.
|
||||||
|
|
||||||
|
3. String input: - Treats the string as a URL or local file path. -
|
||||||
|
Prepends "file://" if the string doesn't start with "http://" or
|
||||||
|
"file://". - Returns a dictionary with the image URL.
|
||||||
|
|
||||||
|
Raises:
|
||||||
|
ValueError: If the input is not a supported type.
|
||||||
|
"""
|
||||||
|
if isinstance(image, dict) and 'bytes' in image:
|
||||||
|
image = Image.open(BytesIO(image['bytes']))
|
||||||
|
if isinstance(image, Image.Image):
|
||||||
|
image = image.convert("RGB")
|
||||||
|
with io.BytesIO() as image_data:
|
||||||
|
image.save(image_data, format="JPEG")
|
||||||
|
image_base64 = base64.b64encode(
|
||||||
|
image_data.getvalue()).decode("utf-8")
|
||||||
|
return {
|
||||||
|
"type": "image_url",
|
||||||
|
"image_url": {
|
||||||
|
"url": f"data:image/jpeg;base64,{image_base64}"
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
if isinstance(image, str):
|
||||||
|
image_url = (image if image.startswith(
|
||||||
|
("http://", "file://")) else f"file://{image}")
|
||||||
|
return {"type": "image_url", "image_url": {"url": image_url}}
|
||||||
|
|
||||||
|
raise ValueError(f"Invalid image input {image}. Must be a PIL.Image.Image"
|
||||||
|
" or str or dictionary with raw image bytes.")
|
||||||
|
|
||||||
|
|
||||||
|
class EBDataset(BenchmarkDataset):
|
||||||
|
"""
|
||||||
|
Implements the ShareGPT dataset. Loads data from a JSON file and generates
|
||||||
|
sample requests based on conversation turns.
|
||||||
|
"""
|
||||||
|
|
||||||
|
temperature: float
|
||||||
|
repetition_penalty: float
|
||||||
|
frequency_penalty: float
|
||||||
|
presence_penalty: float
|
||||||
|
top_p: float
|
||||||
|
prompt_len: int
|
||||||
|
|
||||||
|
def __init__(self, **kwargs) -> None:
|
||||||
|
super().__init__(**kwargs)
|
||||||
|
self.load_data()
|
||||||
|
|
||||||
|
def load_data(self) -> None:
|
||||||
|
if self.dataset_path is None:
|
||||||
|
raise ValueError("dataset_path must be provided for loading data.")
|
||||||
|
|
||||||
|
with open(self.dataset_path, encoding="utf-8") as f:
|
||||||
|
self.data = [json.loads(i.strip()) for i in f.readlines()]
|
||||||
|
|
||||||
|
def sample(
|
||||||
|
self,
|
||||||
|
num_requests: int,
|
||||||
|
lora_path: Optional[str] = None,
|
||||||
|
max_loras: Optional[int] = None,
|
||||||
|
output_len: Optional[int] = None,
|
||||||
|
enable_multimodal_chat: bool = False,
|
||||||
|
**kwargs,
|
||||||
|
) -> list:
|
||||||
|
samples: list = []
|
||||||
|
for entry in self.data:
|
||||||
|
if len(samples) >= num_requests:
|
||||||
|
break
|
||||||
|
prompt = entry["text"]
|
||||||
|
self.temperature = float(entry["temperature"])
|
||||||
|
self.repetition_penalty = float(entry["penalty_score"])
|
||||||
|
self.frequency_penalty = float(entry["frequency_score"])
|
||||||
|
self.presence_penalty = float(entry["presence_score"])
|
||||||
|
self.top_p = float(entry["topp"])
|
||||||
|
self.prompt_len = int(entry["input_token_num"])
|
||||||
|
new_output_len = int(entry["max_dec_len"])
|
||||||
|
|
||||||
|
if enable_multimodal_chat:
|
||||||
|
prompt = self.apply_multimodal_chat_transformation(
|
||||||
|
prompt, None)
|
||||||
|
samples.append(
|
||||||
|
SampleRequest(
|
||||||
|
prompt=prompt,
|
||||||
|
prompt_len=self.prompt_len,
|
||||||
|
history_QA=[],
|
||||||
|
expected_output_len=new_output_len,
|
||||||
|
))
|
||||||
|
|
||||||
|
self.maybe_oversample_requests(samples, num_requests)
|
||||||
|
return samples
|
||||||
|
|
||||||
|
|
||||||
|
class EBChatDataset(BenchmarkDataset):
|
||||||
|
"""
|
||||||
|
Implements the ShareGPT dataset. Loads data from a JSON file and generates
|
||||||
|
sample requests based on conversation turns.
|
||||||
|
"""
|
||||||
|
prompt_len: int
|
||||||
|
|
||||||
|
def __init__(self, **kwargs) -> None:
|
||||||
|
super().__init__(**kwargs)
|
||||||
|
self.load_data()
|
||||||
|
|
||||||
|
def load_data(self) -> None:
|
||||||
|
if self.dataset_path is None:
|
||||||
|
raise ValueError("dataset_path must be provided for loading data.")
|
||||||
|
|
||||||
|
with open(self.dataset_path, encoding="utf-8") as f:
|
||||||
|
self.data = [json.loads(i.strip()) for i in f.readlines()]
|
||||||
|
|
||||||
|
def sample(
|
||||||
|
self,
|
||||||
|
num_requests: int,
|
||||||
|
lora_path: Optional[str] = None,
|
||||||
|
max_loras: Optional[int] = None,
|
||||||
|
output_len: Optional[int] = None,
|
||||||
|
enable_multimodal_chat: bool = False,
|
||||||
|
**kwargs,
|
||||||
|
) -> list:
|
||||||
|
samples: list = []
|
||||||
|
for entry in self.data:
|
||||||
|
if len(samples) >= num_requests:
|
||||||
|
break
|
||||||
|
json_data = entry
|
||||||
|
prompt = entry["messages"][-1].get("content", "")
|
||||||
|
history_QA = entry.get("messages", [])
|
||||||
|
new_output_len = int(entry.get("max_tokens", 12288))
|
||||||
|
|
||||||
|
if enable_multimodal_chat:
|
||||||
|
prompt = self.apply_multimodal_chat_transformation(
|
||||||
|
prompt, None)
|
||||||
|
samples.append(
|
||||||
|
SampleRequest(
|
||||||
|
json_data=json_data,
|
||||||
|
prompt=prompt,
|
||||||
|
prompt_len=0,
|
||||||
|
history_QA=history_QA,
|
||||||
|
expected_output_len=new_output_len,
|
||||||
|
))
|
||||||
|
|
||||||
|
self.maybe_oversample_requests(samples, num_requests)
|
||||||
|
return samples
|
||||||
|
|
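A brief usage sketch (not part of the file) for the EBChatDataset class defined above, assuming an FD-format JSONL file in which each line carries a "messages" list; the file name matches the dataset from the benchmarks README.

```python
dataset = EBChatDataset(
    dataset_path="./filtered_sharedgpt_2000_input_1136_output_200_fd.json"
)
requests = dataset.sample(num_requests=4)
for req in requests:
    print(req.expected_output_len, str(req.prompt)[:60])
```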
benchmarks/benchmark_serving.py (new file, 1141 lines): file diff suppressed because it is too large.
benchmarks/benchmark_utils.py (new file, 90 lines)
@@ -0,0 +1,90 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""

# This file is modified from https://github.com/vllm-project/vllm/blob/main/benchmarks/benchmark_utils.py


import argparse
import json
import math
import os
from typing import Any


def convert_to_pytorch_benchmark_format(args: argparse.Namespace,
                                        metrics: dict[str, list],
                                        extra_info: dict[str, Any]) -> list:
    """
    Save the benchmark results in the format used by the PyTorch OSS benchmark,
    with one metric per record.
    https://github.com/pytorch/pytorch/wiki/How-to-integrate-with-PyTorch-OSS-benchmark-database
    """
    records = []
    if not os.environ.get("SAVE_TO_PYTORCH_BENCHMARK_FORMAT", False):
        return records

    for name, benchmark_values in metrics.items():
        record = {
            "benchmark": {
                "name": "vLLM benchmark",
                "extra_info": {
                    "args": vars(args),
                },
            },
            "model": {
                "name": args.model,
            },
            "metric": {
                "name": name,
                "benchmark_values": benchmark_values,
                "extra_info": extra_info,
            },
        }

        tp = record["benchmark"]["extra_info"]["args"].get(
            "tensor_parallel_size")
        # Save tensor_parallel_size parameter if it's part of the metadata
        if not tp and "tensor_parallel_size" in extra_info:
            record["benchmark"]["extra_info"]["args"][
                "tensor_parallel_size"] = extra_info["tensor_parallel_size"]

        records.append(record)

    return records


class InfEncoder(json.JSONEncoder):
    """JSON encoder that replaces infinite floats with the string "inf"."""

    def clear_inf(self, o: Any):
        """Recursively replace infinite floats in dicts and lists."""
        if isinstance(o, dict):
            return {k: self.clear_inf(v) for k, v in o.items()}
        elif isinstance(o, list):
            return [self.clear_inf(v) for v in o]
        elif isinstance(o, float) and math.isinf(o):
            return "inf"
        return o

    def iterencode(self, o: Any, *args, **kwargs) -> Any:
        """Encode after sanitizing infinite values."""
        return super().iterencode(self.clear_inf(o), *args, **kwargs)


def write_to_json(filename: str, records: list) -> None:
    """Write benchmark records to a JSON file using InfEncoder."""
    with open(filename, "w") as f:
        json.dump(records, f, cls=InfEncoder)
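A short usage sketch of the helpers above. The argparse attributes and metric values are made up for illustration, and the import assumes the script runs from the benchmarks/ directory; records are only produced when SAVE_TO_PYTORCH_BENCHMARK_FORMAT is set.

import argparse
import os

from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json

os.environ["SAVE_TO_PYTORCH_BENCHMARK_FORMAT"] = "1"

args = argparse.Namespace(model="demo-model", tensor_parallel_size=None)
metrics = {"ttft_ms": [12.5, float("inf")]}   # the inf value is serialized as "inf"
extra_info = {"tensor_parallel_size": 8}      # backfills the missing tp argument

records = convert_to_pytorch_benchmark_format(args, metrics, extra_info)
write_to_json("pytorch_benchmark_records.json", records)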
benchmarks/requirements.txt (new file, 5 lines)
aiohttp
tqdm
numpy
Pillow
pyyaml
benchmarks/yaml/eb45-128k-wint4-a800-tp8.yaml (new file, 8 lines)
enable_chunked_prefill: True
max_model_len: 131072
max_num_seqs: 16
kv_cache_ratio: 0.75
tensor_parallel_size: 8
max_num_batched_tokens: 4096
max_num_partial_prefills: 3
max_long_partial_prefills: 3

benchmarks/yaml/eb45-128k-wint4-p800-tp8.yaml (new file, 5 lines)
max_model_len: 131072
max_num_seqs: 40
gpu_memory_utilization: 0.9
tensor_parallel_size: 8
quantization: wint4

benchmarks/yaml/eb45-128k-wint8-a800-tp8.yaml (new file, 8 lines)
enable_chunked_prefill: True
max_model_len: 131072
max_num_seqs: 16
kv_cache_ratio: 0.75
tensor_parallel_size: 8
max_num_batched_tokens: 4096
max_num_partial_prefills: 3
max_long_partial_prefills: 3

benchmarks/yaml/eb45-21B-vl-128k-wint4-h800-tp1.yaml (new file, 10 lines)
enable_mm: True
max_model_len: 32768
max_num_seqs: 128
gpu_memory_utilization: 0.9
kv_cache_ratio: 0.71
tensor_parallel_size: 1
enable_chunked_prefill: True
max_num_batched_tokens: 384
quantization: wint4
reasoning_parser: ernie-45-vl

benchmarks/yaml/eb45-21b-a3b-32k-bf16.yaml (new file, 5 lines)
max_model_len: 32768
max_num_seqs: 128
kv_cache_ratio: 0.75
tensor_parallel_size: 1
max_num_batched_tokens: 32768

benchmarks/yaml/eb45-21b-a3b-32k-wint4-a10.yaml (new file, 5 lines)
max_model_len: 32768
max_num_seqs: 32
kv_cache_ratio: 0.5
tensor_parallel_size: 1
quantization: wint4

benchmarks/yaml/eb45-21b-a3b-32k-wint4.yaml (new file, 6 lines)
max_model_len: 32768
max_num_seqs: 128
kv_cache_ratio: 0.75
tensor_parallel_size: 1
max_num_batched_tokens: 32768
quantization: wint4

benchmarks/yaml/eb45-21b-a3b-32k-wint8.yaml (new file, 6 lines)
max_model_len: 32768
max_num_seqs: 128
kv_cache_ratio: 0.75
tensor_parallel_size: 1
max_num_batched_tokens: 32768
quantization: wint8

benchmarks/yaml/eb45-32k-bf16-a30-tp1.yaml (new file, 5 lines)
max_model_len: 32768
max_num_seqs: 128
kv_cache_ratio: 0.75
tensor_parallel_size: 1
max_num_batched_tokens: 32768

benchmarks/yaml/eb45-32k-blockwise-fp8-h800-tp8.yaml (new file, 12 lines)
max_model_len: 32768
max_num_seqs: 256
tensor_parallel_size: 8
quantization: block_wise_fp8
gpu_memory_utilization: 0.9
kv_cache_ratio: 0.8
enable_chunked_prefill: True
max_num_batched_tokens: 1024
max_num_partial_prefills: 3
max_long_partial_prefills: 3
enable_prefix_caching: True
swap_space: 200

benchmarks/yaml/eb45-32k-tensorwise-fp8-h800-tp8.yaml (new file, 11 lines)
max_model_len: 32768
max_num_seqs: 256
tensor_parallel_size: 8
gpu_memory_utilization: 0.9
kv_cache_ratio: 0.8
enable_chunked_prefill: True
max_num_batched_tokens: 1024
max_num_partial_prefills: 3
max_long_partial_prefills: 3
enable_prefix_caching: True
swap_space: 200

benchmarks/yaml/eb45-32k-w4a8c8-a800-tp4.yaml (new file, 5 lines)
max_model_len: 32768
max_num_seqs: 96
gpu_memory_utilization: 0.9
kv_cache_ratio: 0.71
tensor_parallel_size: 4

benchmarks/yaml/eb45-32k-w4a8c8-tp4_decode.yaml (new file, 15 lines)
max_model_len: 32768
max_num_seqs: 256
gpu_memory_utilization: 0.9
kv_cache_ratio: 0.8
tensor_parallel_size: 4
cache_queue_port: 55663
enable_chunked_prefill: True
splitwise_role: decode
engine_worker_queue_port: 6678
cache_transfer_protocol: "rdma,ipc"
rdma_comm_ports: "7671,7672,7673,7674"
pd_comm_port: "2334"
max_num_batched_tokens: 384
max_num_partial_prefills: 3
max_long_partial_prefills: 3

benchmarks/yaml/eb45-32k-w4a8c8-tp4_prefill.yaml (new file, 12 lines)
max_model_len: 32768
max_num_seqs: 16
gpu_memory_utilization: 0.9
kv_cache_ratio: 0.9
tensor_parallel_size: 4
splitwise_role: prefill
enable_prefix_caching: True
cache_queue_port: 55664
engine_worker_queue_port: 6677
cache_transfer_protocol: "rdma,ipc"
rdma_comm_ports: "7675,7676,7677,7678"
pd_comm_port: "2333"
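The two configs above form a prefill/decode pair for splitwise serving: the decode instance listens on rdma_comm_ports 7671-7674 and pd_comm_port 2334, the prefill instance on 7675-7678 and 2333. The non-overlap requirement is inferred from these values rather than stated in the diff; a small sanity-check sketch under that assumption:

import yaml

# Hedged sketch: load the decode/prefill pair and confirm their RDMA and PD
# communication ports do not collide. The pairing rule is an inference from
# the port values shown above, not documented behaviour.
def load_ports(path):
    with open(path, encoding="utf-8") as f:
        cfg = yaml.safe_load(f)
    rdma = set(cfg.get("rdma_comm_ports", "").split(","))
    return rdma, cfg.get("pd_comm_port")

decode_rdma, decode_pd = load_ports("benchmarks/yaml/eb45-32k-w4a8c8-tp4_decode.yaml")
prefill_rdma, prefill_pd = load_ports("benchmarks/yaml/eb45-32k-w4a8c8-tp4_prefill.yaml")
assert not (decode_rdma & prefill_rdma), "RDMA port ranges overlap"
assert decode_pd != prefill_pd, "PD communication ports clash"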
benchmarks/yaml/eb45-32k-wint2-h20-tp1.yaml (new file, 6 lines)
max_model_len: 32768
max_num_seqs: 128
kv_cache_ratio: 0.75
tensor_parallel_size: 1
enable_prefix_caching: true
enable_chunked_prefill: true

benchmarks/yaml/eb45-32k-wint4-a800-tp4.yaml (new file, 5 lines)
max_model_len: 32768
max_num_seqs: 96
gpu_memory_utilization: 0.9
kv_cache_ratio: 0.71
tensor_parallel_size: 4

benchmarks/yaml/eb45-32k-wint4-h800-dp8_decode.yaml (new file, 13 lines)
max_model_len: 32768
max_num_seqs: 256
gpu_memory_utilization: 0.9
kv_cache_ratio: 0.8
tensor_parallel_size: 1
data_parallel_size: 8
num_gpu_blocks_override: 1024
cache_queue_port: 55663
splitwise_role: decode
engine_worker_queue_port: 6678
cache_transfer_protocol: "rdma"
rdma_comm_ports: "7671,7672,7673,7674,7675,7676,7677,7678"
pd_comm_port: "2334"

benchmarks/yaml/eb45-32k-wint4-h800-dp8_prefill.yaml (new file, 13 lines)
max_model_len: 32768
max_num_seqs: 16
gpu_memory_utilization: 0.9
kv_cache_ratio: 0.9
tensor_parallel_size: 1
data_parallel_size: 8
splitwise_role: prefill
cache_queue_port: 55664
engine_worker_queue_port: 6677
num_gpu_blocks_override: 1024
cache_transfer_protocol: "rdma"
rdma_comm_ports: "7671,7672,7673,7674,7675,7676,7677,7678"
pd_comm_port: "2334"

benchmarks/yaml/eb45-32k-wint4-mtp-h800-tp4.yaml (new file, 6 lines)
max_model_len: 32768
max_num_seqs: 96
gpu_memory_utilization: 0.9
kv_cache_ratio: 0.71
tensor_parallel_size: 4
quantization: wint4

benchmarks/yaml/eb45-32k-wint4-mtp-tp4-decode.yaml (new file, 13 lines)
max_model_len: 32768
max_num_seqs: 128
gpu_memory_utilization: 0.9
kv_cache_ratio: 0.7
tensor_parallel_size: 4
cache_queue_port: 55663
enable_chunked_prefill: False
enable_prefix_caching: False
splitwise_role: decode
engine_worker_queue_port: 6678
cache_transfer_protocol: "rdma,ipc"
rdma_comm_ports: "7671,7672,7673,7674"
pd_comm_port: "2334"

benchmarks/yaml/eb45-32k-wint4-mtp-tp4-prefill.yaml (new file, 12 lines)
max_model_len: 32768
max_num_seqs: 16
gpu_memory_utilization: 0.9
kv_cache_ratio: 0.9
tensor_parallel_size: 4
splitwise_role: prefill
enable_prefix_caching: False
cache_queue_port: 55664
engine_worker_queue_port: 6677
cache_transfer_protocol: "rdma,ipc"
rdma_comm_ports: "7675,7676,7677,7678"
pd_comm_port: "2333"

benchmarks/yaml/eb45-32k-wint4-p800-tp4.yaml (new file, 5 lines)
max_model_len: 32768
max_num_seqs: 40
tensor_parallel_size: 4
quantization: wint4
gpu_memory_utilization: 0.9

benchmarks/yaml/eb45-32k-wint4-p800-tp8.yaml (new file, 5 lines)
max_model_len: 32768
max_num_seqs: 160
tensor_parallel_size: 8
quantization: wint4
gpu_memory_utilization: 0.9

benchmarks/yaml/eb45-32k-wint4-prefixcache-a800-tp4.yaml (new file, 8 lines)
enable_prefix_caching: True
max_model_len: 32768
max_num_seqs: 128
gpu_memory_utilization: 0.9
kv_cache_ratio: 0.71
tensor_parallel_size: 4
swap_space: 200
cache_queue_port: 55664

benchmarks/yaml/eb45-32k-wint4-tp4_decode.yaml (new file, 15 lines)
max_model_len: 32768
max_num_seqs: 256
gpu_memory_utilization: 0.9
kv_cache_ratio: 0.8
tensor_parallel_size: 4
cache_queue_port: 55663
enable_chunked_prefill: True
splitwise_role: decode
engine_worker_queue_port: 6678
cache_transfer_protocol: "rdma,ipc"
rdma_comm_ports: "7671,7672,7673,7674"
pd_comm_port: "2334"
max_num_batched_tokens: 384
max_num_partial_prefills: 3
max_long_partial_prefills: 3

benchmarks/yaml/eb45-32k-wint4-tp4_prefill.yaml (new file, 12 lines)
max_model_len: 32768
max_num_seqs: 16
gpu_memory_utilization: 0.9
kv_cache_ratio: 0.9
tensor_parallel_size: 4
splitwise_role: prefill
enable_prefix_caching: True
cache_queue_port: 55664
engine_worker_queue_port: 6677
cache_transfer_protocol: "rdma,ipc"
rdma_comm_ports: "7675,7676,7677,7678"
pd_comm_port: "2333"

benchmarks/yaml/eb45-32k-wint8-a800-tp8.yaml (new file, 5 lines)
max_model_len: 32768
max_num_seqs: 96
gpu_memory_utilization: 0.9
kv_cache_ratio: 0.71
tensor_parallel_size: 8

benchmarks/yaml/eb45-32k-wint8-p800-tp8.yaml (new file, 5 lines)
max_model_len: 32768
max_num_seqs: 80
tensor_parallel_size: 8
quantization: wint8
gpu_memory_utilization: 0.9

benchmarks/yaml/eb45-32k-wint8-prefixcache-a800-tp8.yaml (new file, 9 lines)
enable_prefix_caching: True
max_model_len: 32768
max_num_batched_tokens: 68304
max_num_seqs: 128
gpu_memory_utilization: 0.9
kv_cache_ratio: 0.71
tensor_parallel_size: 8
swap_space: 100
cache_queue_port: 55664

benchmarks/yaml/eb45-vl-32k-wint4-a800-tp8.yaml (new file, 9 lines)
enable_mm: True
max_model_len: 32768
max_num_seqs: 56
gpu_memory_utilization: 0.9
kv_cache_ratio: 0.8
tensor_parallel_size: 8
quantization: wint4
limit_mm_per_prompt: '{"image": 100, "video": 100}'
reasoning_parser: ernie-45-vl

benchmarks/yaml/eb45-vl-32k-wint4-h800-tp8.yaml (new file, 11 lines)
enable_mm: True
max_model_len: 32768
max_num_seqs: 56
gpu_memory_utilization: 0.8
kv_cache_ratio: 0.8
tensor_parallel_size: 8
quantization: wint4
limit_mm_per_prompt: '{"image": 100, "video": 100}'
enable_chunked_prefill: True
max_num_batched_tokens: 384
reasoning_parser: ernie-45-vl

benchmarks/yaml/eb45-vl-32k-wint4-tp4.yaml (new file, 9 lines)
enable_mm: True
max_model_len: 32768
max_num_seqs: 36
gpu_memory_utilization: 0.9
kv_cache_ratio: 0.8
tensor_parallel_size: 4
quantization: wint4
limit_mm_per_prompt: '{"image": 100, "video": 100}'
reasoning_parser: ernie-45-vl

benchmarks/yaml/eb45-vl-32k-wint8-a800-tp8.yaml (new file, 9 lines)
enable_mm: True
max_model_len: 32768
max_num_seqs: 36
gpu_memory_utilization: 0.95
kv_cache_ratio: 0.8
tensor_parallel_size: 8
quantization: wint8
limit_mm_per_prompt: '{"image": 100, "video": 100}'
reasoning_parser: ernie-45-vl

benchmarks/yaml/eb45-vl-32k-wint8-h800-tp8.yaml (new file, 11 lines)
enable_mm: True
max_model_len: 32768
max_num_seqs: 36
gpu_memory_utilization: 0.8
kv_cache_ratio: 0.8
tensor_parallel_size: 8
quantization: wint8
limit_mm_per_prompt: '{"image": 100, "video": 100}'
enable_chunked_prefill: True
max_num_batched_tokens: 384
reasoning_parser: ernie-45-vl

benchmarks/yaml/eb45-vl-32k-wint8-tp4.yaml (new file, 9 lines)
enable_mm: True
max_model_len: 32768
max_num_seqs: 36
gpu_memory_utilization: 0.9
kv_cache_ratio: 0.8
tensor_parallel_size: 4
quantization: wint8
limit_mm_per_prompt: '{"image": 100, "video": 100}'
reasoning_parser: ernie-45-vl

(new 5-line YAML file, name not shown in this view)
max_model_len: 32768
max_num_seqs: 128
kv_cache_ratio: 0.75
tensor_parallel_size: 1
enable_static_graph_inference: True

(new 5-line YAML file, name not shown in this view)
max_model_len: 32768
max_num_seqs: 128
kv_cache_ratio: 0.75
tensor_parallel_size: 1
enable_static_graph_inference: True

(new 6-line YAML file, name not shown in this view)
max_model_len: 32768
max_num_seqs: 128
kv_cache_ratio: 0.75
tensor_parallel_size: 1
quantization: wint8
enable_static_graph_inference: True

(new 6-line YAML file, name not shown in this view)
max_model_len: 32768
max_num_seqs: 128
kv_cache_ratio: 0.75
tensor_parallel_size: 1
quantization: wint8
enable_static_graph_inference: True

benchmarks/yaml/eb45t_21b-32k-bf16-h800-tp1-static.yaml (new file, 5 lines)
max_model_len: 32768
max_num_seqs: 128
kv_cache_ratio: 0.75
tensor_parallel_size: 1
enable_static_graph_inference: True

benchmarks/yaml/eb45t_21b-32k-wint4-h800-tp1-static.yaml (new file, 6 lines)
max_model_len: 32768
max_num_seqs: 128
kv_cache_ratio: 0.75
tensor_parallel_size: 1
quantization: wint4
enable_static_graph_inference: True

(new 6-line YAML file, name not shown in this view)
max_model_len: 32768
max_num_seqs: 96
gpu_memory_utilization: 0.9
kv_cache_ratio: 0.71
tensor_parallel_size: 4
enable_static_graph_inference: True

benchmarks/yaml/qwen2_7b-32k-bf16-a30-tp1-static.yaml (new file, 5 lines)
max_model_len: 32768
max_num_seqs: 128
kv_cache_ratio: 0.75
tensor_parallel_size: 1
enable_static_graph_inference: True

benchmarks/yaml/qwen2_7b-32k-bf16-h800-tp1-static.yaml (new file, 5 lines)
max_model_len: 32768
max_num_seqs: 128
kv_cache_ratio: 0.75
tensor_parallel_size: 1
enable_static_graph_inference: True

benchmarks/yaml/qwen2_7b-32k-bf16-h800-tp1.yaml (new file, 4 lines)
max_model_len: 32768
max_num_seqs: 128
kv_cache_ratio: 0.75
tensor_parallel_size: 1

benchmarks/yaml/qwen2_7b-32k-fp8-h800-tp1-static.yaml (new file, 6 lines)
max_model_len: 32768
max_num_seqs: 128
kv_cache_ratio: 0.75
tensor_parallel_size: 1
quantization: wfp8afp8
enable_static_graph_inference: True

benchmarks/yaml/qwen2_7b-32k-fp8-h800-tp1.yaml (new file, 5 lines)
max_model_len: 32768
max_num_seqs: 128
kv_cache_ratio: 0.75
tensor_parallel_size: 1
quantization: wfp8afp8

benchmarks/yaml/qwen2_7b-32k-wint8-h800-tp1.yaml (new file, 5 lines)
max_model_len: 32768
max_num_seqs: 128
kv_cache_ratio: 0.75
tensor_parallel_size: 1
quantization: wint8

(new 5-line YAML file, name not shown in this view)
max_model_len: 32768
max_num_seqs: 128
kv_cache_ratio: 0.75
tensor_parallel_size: 1
enable_static_graph_inference: True

(new 5-line YAML file, name not shown in this view)
max_model_len: 32768
max_num_seqs: 128
kv_cache_ratio: 0.75
tensor_parallel_size: 1
enable_static_graph_inference: True

(new 6-line YAML file, name not shown in this view)
max_model_len: 32768
max_num_seqs: 128
kv_cache_ratio: 0.75
tensor_parallel_size: 1
quantization: wint8
enable_static_graph_inference: True

(new 6-line YAML file, name not shown in this view)
max_model_len: 32768
max_num_seqs: 128
kv_cache_ratio: 0.75
tensor_parallel_size: 1
quantization: wint8
enable_static_graph_inference: True

benchmarks/yaml/qwen3_30b-32k-bf16-h800-tp1-static.yaml (new file, 5 lines)
max_model_len: 32768
max_num_seqs: 128
kv_cache_ratio: 0.75
tensor_parallel_size: 1
enable_static_graph_inference: True

benchmarks/yaml/qwen3_30b-32k-wint4-h800-tp1-static.yaml (new file, 6 lines)
max_model_len: 32768
max_num_seqs: 128
kv_cache_ratio: 0.75
tensor_parallel_size: 1
quantization: wint4
enable_static_graph_inference: True

benchmarks/yaml/qwen3dot6b-32k-bf16-a30-tp1.yaml (new file, 5 lines)
max_model_len: 32768
max_num_seqs: 256
gpu_memory_utilization: 0.9
kv_cache_ratio: 0.75
tensor_parallel_size: 1

benchmarks/yaml/qwen3dot6b-32k-bf16-a800-tp1.yaml (new file, 5 lines)
max_model_len: 32768
max_num_seqs: 256
gpu_memory_utilization: 0.9
kv_cache_ratio: 0.75
tensor_parallel_size: 1

benchmarks/yaml/qwen3dot6b-32k-bf16-h800-tp1.yaml (new file, 5 lines)
max_model_len: 32768
max_num_seqs: 256
gpu_memory_utilization: 0.9
kv_cache_ratio: 0.75
tensor_parallel_size: 1

benchmarks/yaml/qwen3dot6b-32k-wint8-a30-tp1.yaml (new file, 6 lines)
max_model_len: 32768
max_num_seqs: 256
gpu_memory_utilization: 0.9
kv_cache_ratio: 0.75
quantization: wint8
tensor_parallel_size: 1

benchmarks/yaml/qwen3dot6b-32k-wint8-a800-tp1.yaml (new file, 6 lines)
max_model_len: 32768
max_num_seqs: 256
gpu_memory_utilization: 0.9
kv_cache_ratio: 0.75
quantization: wint8
tensor_parallel_size: 1

benchmarks/yaml/qwen3dot6b-32k-wint8-h800-tp1.yaml (new file, 6 lines)
max_model_len: 32768
max_num_seqs: 256
gpu_memory_utilization: 0.9
kv_cache_ratio: 0.75
quantization: wint8
tensor_parallel_size: 1

benchmarks/yaml/qwen3moe235b-32k-wint4-h800-tp4.yaml (new file, 6 lines)
max_model_len: 32768
max_num_seqs: 75
gpu_memory_utilization: 0.85
kv_cache_ratio: 0.75
quantization: wint4
tensor_parallel_size: 4

benchmarks/yaml/qwen3moe235b-32k-wint8-h800-tp4.yaml (new file, 6 lines)
max_model_len: 32768
max_num_seqs: 25
gpu_memory_utilization: 0.9
kv_cache_ratio: 0.75
quantization: wint8
tensor_parallel_size: 4

benchmarks/yaml/qwen3moe30b-32k-bf16-a800-tp1.yaml (new file, 5 lines)
max_model_len: 32768
max_num_seqs: 50
gpu_memory_utilization: 0.85
kv_cache_ratio: 0.75
tensor_parallel_size: 1

benchmarks/yaml/qwen3moe30b-32k-bf16-h800-tp1.yaml (new file, 5 lines)
max_model_len: 32768
max_num_seqs: 50
gpu_memory_utilization: 0.85
kv_cache_ratio: 0.75
tensor_parallel_size: 1

benchmarks/yaml/qwen3moe30b-32k-wint4-a800-tp1.yaml (new file, 6 lines)
max_model_len: 32768
max_num_seqs: 50
gpu_memory_utilization: 0.8
kv_cache_ratio: 0.75
quantization: wint4
tensor_parallel_size: 1

benchmarks/yaml/qwen3moe30b-32k-wint4-h800-tp1.yaml (new file, 6 lines)
max_model_len: 32768
max_num_seqs: 50
gpu_memory_utilization: 0.8
kv_cache_ratio: 0.75
quantization: wint4
tensor_parallel_size: 1
benchmarks/yaml/request_yaml/eb45-128k.yaml (new file, 8 lines)
top_p: 0.8
temperature: 0.8
metadata:
  min_tokens: 1
max_tokens: 131071
repetition_penalty: 1.0
frequency_penalty: 0
presence_penalty: 0

benchmarks/yaml/request_yaml/eb45-32k.yaml (new file, 8 lines)
top_p: 0.8
temperature: 0.8
metadata:
  min_tokens: 1
max_tokens: 12288
repetition_penalty: 1.0
frequency_penalty: 0
presence_penalty: 0

benchmarks/yaml/request_yaml/qwen2-32k.yaml (new file, 8 lines)
top_p: 0.8
temperature: 0.7
metadata:
  min_tokens: 1
max_tokens: 12288
repetition_penalty: 1.05
frequency_penalty: 0
presence_penalty: 0

benchmarks/yaml/request_yaml/qwen3-32k.yaml (new file, 8 lines)
top_p: 0.8
temperature: 0.7
metadata:
  min_tokens: 1
max_tokens: 12288
repetition_penalty: 1.0
frequency_penalty: 0
presence_penalty: 1.5

benchmarks/yaml/request_yaml/x1-32k.yaml (new file, 8 lines)
top_p: 0.95
temperature: 0.6
metadata:
  min_tokens: 1
max_tokens: 32767
repetition_penalty: 1.0
frequency_penalty: 0
presence_penalty: 0
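These request_yaml files carry per-request sampling parameters. The sketch below shows one way such a file could be folded into an OpenAI-style chat payload; the model name and the exact mapping used by benchmark_serving.py (whose diff is suppressed above) are assumptions for illustration only.

import yaml

# Hedged illustration: read benchmarks/yaml/request_yaml/eb45-32k.yaml and merge
# its keys into a chat-completions request body. pyyaml is already listed in
# benchmarks/requirements.txt.
with open("benchmarks/yaml/request_yaml/eb45-32k.yaml", encoding="utf-8") as f:
    req_cfg = yaml.safe_load(f)

payload = {
    "model": "eb45",  # placeholder model name, not taken from this diff
    "messages": [{"role": "user", "content": "hello"}],
    **{k: v for k, v in req_cfg.items() if k != "metadata"},
    "metadata": req_cfg.get("metadata", {}),
}
print(payload["top_p"], payload["max_tokens"])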
benchmarks/yaml/x1-32k-wint4-h800-tp8.yaml (new file, 6 lines)
tensor_parallel_size: 8
max_model_len: 32768
max_num_seqs: 32
num_gpu_blocks_override: 4096
kv_cache_ratio: 0.5
reasoning_parser: ernie-x1

benchmarks/yaml/x1-32k-wint4-p800-tp4.yaml (new file, 6 lines)
max_model_len: 32768
max_num_seqs: 32
gpu_memory_utilization: 0.9
tensor_parallel_size: 4
quantization: wint4
reasoning_parser: ernie-x1

benchmarks/yaml/x1-32k-wint4-p800-tp8.yaml (new file, 6 lines)
max_model_len: 32768
max_num_seqs: 128
gpu_memory_utilization: 0.9
tensor_parallel_size: 8
quantization: wint4
reasoning_parser: ernie-x1

benchmarks/yaml/x1-32k-wint4-prefixcache-h800-tp8.yaml (new file, 10 lines)
enable_prefix_caching: True
num_gpu_blocks_override: 8000
max_model_len: 32768
max_num_seqs: 64
gpu_memory_utilization: 0.85
kv_cache_ratio: 0.5
tensor_parallel_size: 8
swap_space: 200
cache_queue_port: 55664
reasoning_parser: ernie-x1

benchmarks/yaml/x1-32k-wint8-h800-tp8.yaml (new file, 6 lines)
tensor_parallel_size: 8
max_model_len: 32768
max_num_seqs: 32
num_gpu_blocks_override: 4096
kv_cache_ratio: 0.5
reasoning_parser: ernie-x1

benchmarks/yaml/x1-32k-wint8-p800-tp4.yaml (new file, 6 lines)
max_model_len: 32768
max_num_seqs: 8
gpu_memory_utilization: 0.9
tensor_parallel_size: 4
quantization: wint8
reasoning_parser: ernie-x1

benchmarks/yaml/x1-32k-wint8-p800-tp8.yaml (new file, 6 lines)
max_model_len: 32768
max_num_seqs: 64
gpu_memory_utilization: 0.9
tensor_parallel_size: 8
quantization: wint8
reasoning_parser: ernie-x1

benchmarks/yaml/x1-32k-wint8-prefixcache-h800-tp8.yaml (new file, 10 lines)
enable_prefix_caching: True
num_gpu_blocks_override: 8000
max_model_len: 32768
max_num_seqs: 64
gpu_memory_utilization: 0.85
kv_cache_ratio: 0.5
tensor_parallel_size: 8
swap_space: 200
cache_queue_port: 55664
reasoning_parser: ernie-x1
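The server-side configs above are engine launch parameters rather than request parameters. A hedged sketch of turning one of them into command-line flags follows; the underscore-to-dash flag convention is an assumption and should be checked against the FastDeploy serving entrypoint, which this diff does not show.

import yaml

# Sketch only: convert a benchmark server config into CLI-style flags.
def yaml_to_cli_flags(path: str) -> list:
    with open(path, encoding="utf-8") as f:
        cfg = yaml.safe_load(f)
    flags = []
    for key, value in cfg.items():
        flags.append("--" + key.replace("_", "-"))  # assumed flag spelling
        flags.append(str(value))
    return flags

print(" ".join(yaml_to_cli_flags("benchmarks/yaml/eb45-32k-wint4-a800-tp4.yaml")))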
build.sh (modified)
@@ -17,8 +17,9 @@
 BUILD_WHEEL=${1:-1}
 PYTHON_VERSION=${2:-"python"}
 export python=$PYTHON_VERSION
-CPU_USE_BF16=${3:-"false"}
-BUILDING_ARCS=${4:-""}
+FD_CPU_USE_BF16=${3:-"false"}
+FD_BUILDING_ARCS=${4:-""}
+
 
 # paddle distributed use to set archs
 unset PADDLE_CUDA_ARCH_LIST
@@ -30,13 +31,9 @@ EGG_DIR="fastdeploy.egg-info"
 
 # custom_ops directory config
 OPS_SRC_DIR="custom_ops"
-OPS_BUILD_DIR="build"
-OPS_EGG_DIR="efficitentllm_ops.egg-info"
 OPS_TMP_DIR_BASE="tmp_base"
 OPS_TMP_DIR="tmp"
 
-TEST_DIR="tests"
-
 # command line log config
 RED='\033[0;31m'
 BLUE='\033[0;34m'
@@ -44,13 +41,14 @@ GREEN='\033[1;32m'
 BOLD='\033[1m'
 NONE='\033[0m'
 
+DEVICE_TYPE="gpu"
+
 function python_version_check() {
     PY_MAIN_VERSION=`${python} -V 2>&1 | awk '{print $2}' | awk -F '.' '{print $1}'`
     PY_SUB_VERSION=`${python} -V 2>&1 | awk '{print $2}' | awk -F '.' '{print $2}'`
     echo -e "find python version ${PY_MAIN_VERSION}.${PY_SUB_VERSION}"
-    if [ $PY_MAIN_VERSION -ne "3" -o $PY_SUB_VERSION -lt "8" ]; then
-        echo -e "${RED}FAIL:${NONE} please use Python >= 3.8"
+    if [ $PY_MAIN_VERSION -ne "3" -o $PY_SUB_VERSION -lt "9" ]; then
+        echo -e "${RED}FAIL:${NONE} please use Python >= 3.9"
         exit 1
     fi
 }
@@ -75,6 +73,7 @@ function copy_ops(){
     WHEEL_CPU_NAME="fastdeploy_cpu_ops-${OPS_VERSION}-${PY_VERSION}-${SYSTEM_VERSION}-${PROCESSOR_VERSION}.egg"
     is_rocm=`$python -c "import paddle; print(paddle.is_compiled_with_rocm())"`
     if [ "$is_rocm" = "True" ]; then
+        DEVICE_TYPE="rocm"
         cp -r ./${OPS_TMP_DIR}/${WHEEL_NAME}/* ../fastdeploy/model_executor/ops/gpu
         echo -e "ROCM ops have been copy to fastdeploy"
         return
@@ -82,6 +81,7 @@ function copy_ops(){
     mkdir -p ../fastdeploy/model_executor/ops/base
     is_cuda=`$python -c "import paddle; print(paddle.is_compiled_with_cuda())"`
     if [ "$is_cuda" = "True" ]; then
+        DEVICE_TYPE="gpu"
         cp -r ./${OPS_TMP_DIR_BASE}/${WHEEL_BASE_NAME}/* ../fastdeploy/model_executor/ops/base
         cp -r ./${OPS_TMP_DIR}/${WHEEL_NAME}/* ../fastdeploy/model_executor/ops/gpu
         echo -e "BASE and CUDA ops have been copy to fastdeploy"
@@ -90,6 +90,7 @@ function copy_ops(){
 
     is_xpu=`$python -c "import paddle; print(paddle.is_compiled_with_xpu())"`
     if [ "$is_xpu" = "True" ]; then
+        DEVICE_TYPE="xpu"
         cp -r ./${OPS_TMP_DIR}/${WHEEL_NAME}/* ../fastdeploy/model_executor/ops/xpu
         echo -e "xpu ops have been copy to fastdeploy"
         return
@@ -97,20 +98,14 @@ function copy_ops(){
 
     is_npu=`$python -c "import paddle; print(paddle.is_compiled_with_custom_device('npu'))"`
     if [ "$is_npu" = "True" ]; then
+        DEVICE_TYPE="npu"
         cp -r ${OPS_TMP_DIR}/${WHEEL_NAME}/* ../fastdeploy/model_executor/ops/npu
         echo -e "npu ops have been copy to fastdeploy"
         return
     fi
 
+    DEVICE_TYPE="cpu"
     cp -r ./${OPS_TMP_DIR_BASE}/${WHEEL_BASE_NAME}/* ../fastdeploy/model_executor/ops/base
-    cd ${OPS_TMP_DIR}/${WHEEL_CPU_NAME}/xFasterTransformer/build/
-    for file in *_pd_.so; do
-        mv "$file" "${file/_pd_/}"
-    done
-    cd ../../x86-simd-sort/builddir/
-    for file in *_pd_.so; do
-        mv "$file" "${file/_pd_/}"
-    done
     cd ../../../../
     cp -r ${OPS_TMP_DIR}/${WHEEL_CPU_NAME}/* ../fastdeploy/model_executor/ops/cpu
     echo -e "BASE and CPU ops have been copy to fastdeploy"
@@ -122,15 +117,30 @@ function build_and_install_ops() {
     export no_proxy=bcebos.com,paddlepaddle.org.cn,${no_proxy}
     echo -e "${BLUE}[build]${NONE} build and install fastdeploy_base_ops..."
     ${python} setup_ops_base.py install --install-lib ${OPS_TMP_DIR_BASE}
+    find ${OPS_TMP_DIR_BASE} -type f -name "*.o" -exec rm -f {} \;
     echo -e "${BLUE}[build]${NONE} build and install fastdeploy_ops..."
-    if [ "$CPU_USE_BF16" == "true" ]; then
-        CPU_USE_BF16=True ${python} setup_ops.py install --install-lib ${OPS_TMP_DIR}
-        :
-    elif [ "$CPU_USE_BF16" == "false" ]; then
+    TMP_DIR_REAL_PATH=`readlink -f ${OPS_TMP_DIR}`
+    is_xpu=`$python -c "import paddle; print(paddle.is_compiled_with_xpu())"`
+    if [ "$is_xpu" = "True" ]; then
+        cd xpu_ops/src
+        bash build.sh ${TMP_DIR_REAL_PATH}
+        cd ../..
+    elif [ "$FD_CPU_USE_BF16" == "true" ]; then
+        if [ "$FD_BUILDING_ARCS" == "" ]; then
+            FD_CPU_USE_BF16=True ${python} setup_ops.py install --install-lib ${OPS_TMP_DIR}
+        else
+            FD_BUILDING_ARCS=${FD_BUILDING_ARCS} FD_CPU_USE_BF16=True ${python} setup_ops.py install --install-lib ${OPS_TMP_DIR}
+        fi
+        find ${OPS_TMP_DIR} -type f -name "*.o" -exec rm -f {} \;
+    elif [ "$FD_CPU_USE_BF16" == "false" ]; then
+        if [ "$FD_BUILDING_ARCS" == "" ]; then
         ${python} setup_ops.py install --install-lib ${OPS_TMP_DIR}
-        :
+        else
+            FD_BUILDING_ARCS=${FD_BUILDING_ARCS} ${python} setup_ops.py install --install-lib ${OPS_TMP_DIR}
+        fi
+        find ${OPS_TMP_DIR} -type f -name "*.o" -exec rm -f {} \;
     else
-        echo "Error: Invalid parameter '$CPU_USE_BF16'. Please use true or false."
+        echo "Error: Invalid parameter '$FD_CPU_USE_BF16'. Please use true or false."
         exit 1
     fi
     if [ $? -ne 0 ]; then
@@ -146,11 +156,7 @@ function build_and_install_ops() {
 
 function build_and_install() {
     echo -e "${BLUE}[build]${NONE} building fastdeploy wheel..."
-    if [ "$BUILDING_ARCS" == "" ]; then
-        ${python} setup.py bdist_wheel --python-tag py3
-    else
-        BUILDING_ARCS=${BUILDING_ARCS} ${python} setup.py bdist_wheel --python-tag py3
-    fi
+    ${python} setup.py bdist_wheel --python-tag=py3
 
     if [ $? -ne 0 ]; then
         echo -e "${RED}[FAIL]${NONE} build fastdeploy wheel failed"
@@ -174,10 +180,12 @@ function cleanup() {
     rm -rf $BUILD_DIR $EGG_DIR
     if [ `${python} -m pip list | grep fastdeploy | wc -l` -gt 0 ]; then
         echo -e "${BLUE}[init]${NONE} uninstalling fastdeploy..."
-        ${python} -m pip uninstall -y fastdeploy
+        ${python} -m pip uninstall -y fastdeploy-${DEVICE_TYPE}
     fi
 
     rm -rf $OPS_SRC_DIR/$BUILD_DIR $OPS_SRC_DIR/$EGG_DIR
+    rm -rf $OPS_SRC_DIR/$OPS_TMP_DIR_BASE
+    rm -rf $OPS_SRC_DIR/$OPS_TMP_DIR
 }
 
 function abort() {
@@ -187,7 +195,7 @@ function abort() {
     cur_dir=`basename "$pwd"`
 
     rm -rf $BUILD_DIR $EGG_DIR $DIST_DIR
-    ${python} -m pip uninstall -y fastdeploy
+    ${python} -m pip uninstall -y fastdeploy-${DEVICE_TYPE}
 
     rm -rf $OPS_SRC_DIR/$BUILD_DIR $OPS_SRC_DIR/$EGG_DIR
 }
custom_ops/0001-DeepGEMM-95e81b3.patch (new file, 643 lines)
|
|||||||
|
From 5112002c155dceecc5e5983cdb67157e4f5400e2 Mon Sep 17 00:00:00 2001
|
||||||
|
From: minghaipeng <minghaipeng@baidu.com>
|
||||||
|
Date: Wed, 25 Jun 2025 15:05:24 +0800
|
||||||
|
Subject: [PATCH] DeepGEMM 95e81b3
|
||||||
|
|
||||||
|
---
|
||||||
|
deep_gemm/__init__.py | 2 +-
|
||||||
|
deep_gemm/include/deep_gemm/scheduler.cuh | 2 +-
|
||||||
|
deep_gemm/jit/compiler.py | 2 +-
|
||||||
|
deep_gemm/jit/interleave_ffma.py | 2 +-
|
||||||
|
deep_gemm/jit/runtime.py | 4 +-
|
||||||
|
deep_gemm/jit/template.py | 34 ++++----
|
||||||
|
deep_gemm/jit_kernels/gemm.py | 44 +++++------
|
||||||
|
deep_gemm/jit_kernels/m_grouped_gemm.py | 96 +++++++++++------------
|
||||||
|
deep_gemm/jit_kernels/tuner.py | 10 +--
|
||||||
|
deep_gemm/jit_kernels/utils.py | 18 +++--
|
||||||
|
deep_gemm/paddle_utils.py | 20 +++++
|
||||||
|
deep_gemm/utils.py | 30 +++----
|
||||||
|
12 files changed, 143 insertions(+), 121 deletions(-)
|
||||||
|
create mode 100644 deep_gemm/paddle_utils.py
|
||||||
|
|
||||||
|
diff --git a/deep_gemm/__init__.py b/deep_gemm/__init__.py
|
||||||
|
index 15b22ca..63e7fb7 100644
|
||||||
|
--- a/deep_gemm/__init__.py
|
||||||
|
+++ b/deep_gemm/__init__.py
|
||||||
|
@@ -1,4 +1,4 @@
|
||||||
|
-import torch
|
||||||
|
+import paddle
|
||||||
|
|
||||||
|
from . import jit
|
||||||
|
from .jit_kernels import (
|
||||||
|
diff --git a/deep_gemm/include/deep_gemm/scheduler.cuh b/deep_gemm/include/deep_gemm/scheduler.cuh
|
||||||
|
index 9743871..6c97152 100644
|
||||||
|
--- a/deep_gemm/include/deep_gemm/scheduler.cuh
|
||||||
|
+++ b/deep_gemm/include/deep_gemm/scheduler.cuh
|
||||||
|
@@ -102,7 +102,7 @@ struct Scheduler {
|
||||||
|
if constexpr (kGemmType == GemmType::Normal) {
|
||||||
|
return block_idx * block_size;
|
||||||
|
} else if constexpr (kGemmType == GemmType::GroupedContiguous) {
|
||||||
|
- auto offset = kIgnoreGroupedForGroupedContiguous ? 0 : __ldg(grouped_layout + m_block_idx * BLOCK_M);
|
||||||
|
+ auto offset = kIgnoreGroupedForGroupedContiguous ? 0 : max(0, __ldg(grouped_layout + m_block_idx * BLOCK_M));
|
||||||
|
return offset * shape_dim + block_idx * block_size;
|
||||||
|
} else if constexpr (kGemmType == GemmType::GroupedMasked) {
|
||||||
|
return curr_group_idx * shape_dim + block_idx * block_size;
|
||||||
|
diff --git a/deep_gemm/jit/compiler.py b/deep_gemm/jit/compiler.py
|
||||||
|
index c17d466..6fdc52f 100644
|
||||||
|
--- a/deep_gemm/jit/compiler.py
|
||||||
|
+++ b/deep_gemm/jit/compiler.py
|
||||||
|
@@ -4,7 +4,7 @@ import os
|
||||||
|
import re
|
||||||
|
import subprocess
|
||||||
|
import uuid
|
||||||
|
-from torch.utils.cpp_extension import CUDA_HOME
|
||||||
|
+from ..paddle_utils import CUDA_HOME
|
||||||
|
from typing import Tuple
|
||||||
|
|
||||||
|
from . import interleave_ffma
|
||||||
|
diff --git a/deep_gemm/jit/interleave_ffma.py b/deep_gemm/jit/interleave_ffma.py
|
||||||
|
index fcb377e..db9d6f3 100644
|
||||||
|
--- a/deep_gemm/jit/interleave_ffma.py
|
||||||
|
+++ b/deep_gemm/jit/interleave_ffma.py
|
||||||
|
@@ -3,7 +3,7 @@ import mmap
|
||||||
|
import os
|
||||||
|
import re
|
||||||
|
import subprocess
|
||||||
|
-from torch.utils.cpp_extension import CUDA_HOME
|
||||||
|
+from ..paddle_utils import CUDA_HOME
|
||||||
|
|
||||||
|
|
||||||
|
def run_cuobjdump(file_path):
|
||||||
|
diff --git a/deep_gemm/jit/runtime.py b/deep_gemm/jit/runtime.py
|
||||||
|
index 66c370a..4761426 100644
|
||||||
|
--- a/deep_gemm/jit/runtime.py
|
||||||
|
+++ b/deep_gemm/jit/runtime.py
|
||||||
|
@@ -1,6 +1,6 @@
|
||||||
|
import ctypes
|
||||||
|
import os
|
||||||
|
-import torch
|
||||||
|
+import paddle
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
|
from .template import map_ctype
|
||||||
|
@@ -35,7 +35,7 @@ class Runtime:
|
||||||
|
assert len(args) == len(self.args), f'Expected {len(self.args)} arguments, got {len(args)}'
|
||||||
|
cargs = []
|
||||||
|
for arg, (name, dtype) in zip(args, self.args):
|
||||||
|
- if isinstance(arg, torch.Tensor):
|
||||||
|
+ if isinstance(arg, paddle.Tensor):
|
||||||
|
assert arg.dtype == dtype, f'Expected tensor dtype `{dtype}` for `{name}`, got `{arg.dtype}`'
|
||||||
|
else:
|
||||||
|
assert isinstance(arg, dtype), f'Expected built-in type `{dtype}` for `{name}`, got `{type(arg)}`'
|
||||||
|
diff --git a/deep_gemm/jit/template.py b/deep_gemm/jit/template.py
|
||||||
|
index ead37f5..51b02c1 100644
|
||||||
|
--- a/deep_gemm/jit/template.py
|
||||||
|
+++ b/deep_gemm/jit/template.py
|
||||||
|
@@ -1,24 +1,24 @@
|
||||||
|
import copy
|
||||||
|
import ctypes
|
||||||
|
import os
|
||||||
|
-import torch
|
||||||
|
+import paddle
|
||||||
|
from typing import Any, Dict, Iterable, Tuple
|
||||||
|
|
||||||
|
|
||||||
|
# Name map for Python `eval`
|
||||||
|
typename_map: Dict[Any, str] = {
|
||||||
|
**{t: t.__name__ for t in (bool, int, float)},
|
||||||
|
- torch.int: 'torch.int',
|
||||||
|
- torch.float: 'torch.float',
|
||||||
|
- torch.bfloat16: 'torch.bfloat16',
|
||||||
|
- torch.float8_e4m3fn: 'torch.float8_e4m3fn',
|
||||||
|
- torch.cuda.Stream: 'torch.cuda.Stream',
|
||||||
|
+ paddle.int32: 'paddle.int32',
|
||||||
|
+ paddle.float32: 'paddle.float32',
|
||||||
|
+ paddle.bfloat16: 'paddle.bfloat16',
|
||||||
|
+ paddle.float8_e4m3fn: 'paddle.float8_e4m3fn',
|
||||||
|
+ paddle.device.cuda.Stream: "paddle.device.cuda.Stream",
|
||||||
|
}
|
||||||
|
|
||||||
|
# `ctype` map for Python casting
|
||||||
|
ctype_map: Dict[Any, Any] = {
|
||||||
|
**{t: getattr(ctypes, f'c_{t.__name__}') for t in (bool, int, float)},
|
||||||
|
- **{t: ctypes.c_void_p for t in (torch.int, torch.float, torch.bfloat16, torch.float8_e4m3fn, torch.cuda.Stream)},
|
||||||
|
+ **{t: ctypes.c_void_p for t in (paddle.int32, paddle.float32, paddle.bfloat16, paddle.float8_e4m3fn, paddle.device.cuda.Stream)},
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@@ -27,25 +27,25 @@ genc_map = {
|
||||||
|
bool: ('bool', 'bool'),
|
||||||
|
int: ('int', 'int'),
|
||||||
|
float: ('float', 'float'),
|
||||||
|
- torch.int: ('void*', 'int*'),
|
||||||
|
- torch.float: ('void*', 'float*'),
|
||||||
|
- torch.bfloat16: ('void*', '__nv_bfloat16*'),
|
||||||
|
- torch.float8_e4m3fn: ('void*', '__nv_fp8_e4m3*'),
|
||||||
|
- torch.cuda.Stream: ('void*', 'cudaStream_t'),
|
||||||
|
+ paddle.int32: ('void*', 'int*'),
|
||||||
|
+ paddle.float32: ('void*', 'float*'),
|
||||||
|
+ paddle.bfloat16: ('void*', '__nv_bfloat16*'),
|
||||||
|
+ paddle.float8_e4m3fn: ('void*', '__nv_fp8_e4m3*'),
|
||||||
|
+ paddle.device.cuda.Stream: ('void*', 'cudaStream_t'),
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def map_ctype(value: Any) -> Any:
|
||||||
|
if hasattr(value, 'data_ptr'):
|
||||||
|
- if value.dtype == torch.int:
|
||||||
|
+ if value.dtype == paddle.int32:
|
||||||
|
return ctypes.c_void_p(value.data_ptr())
|
||||||
|
- elif value.dtype == torch.float:
|
||||||
|
+ elif value.dtype == paddle.float32:
|
||||||
|
return ctypes.c_void_p(value.data_ptr())
|
||||||
|
- elif value.dtype == torch.bfloat16:
|
||||||
|
+ elif value.dtype == paddle.bfloat16:
|
||||||
|
return ctypes.c_void_p(value.data_ptr())
|
||||||
|
- elif value.dtype == torch.float16:
|
||||||
|
+ elif value.dtype == paddle.float16:
|
||||||
|
return ctypes.c_void_p(value.data_ptr())
|
||||||
|
- elif value.dtype == torch.float8_e4m3fn:
|
||||||
|
+ elif value.dtype == paddle.float8_e4m3fn:
|
||||||
|
return ctypes.c_void_p(value.data_ptr())
|
||||||
|
else:
|
||||||
|
return ctypes.c_void_p(value.data_ptr())
|
||||||
|
diff --git a/deep_gemm/jit_kernels/gemm.py b/deep_gemm/jit_kernels/gemm.py
|
||||||
|
index cb438b7..44aa0ed 100644
|
||||||
|
--- a/deep_gemm/jit_kernels/gemm.py
|
||||||
|
+++ b/deep_gemm/jit_kernels/gemm.py
|
||||||
|
@@ -1,5 +1,5 @@
|
||||||
|
import math
|
||||||
|
-import torch
|
||||||
|
+import paddle
|
||||||
|
from functools import lru_cache
|
||||||
|
from typing import Tuple
|
||||||
|
|
||||||
|
@@ -166,20 +166,20 @@ def get_best_configs(m: int, n: int, k: int, num_groups: int, num_sms: int,
|
||||||
|
return num_min_sms, best_block_m, best_block_n, best_num_stages, best_tma_multicast_config, best_smem_config
|
||||||
|
|
||||||
|
|
||||||
|
-def gemm_fp8_fp8_bf16_nt(lhs: Tuple[torch.Tensor, torch.Tensor],
|
||||||
|
- rhs: Tuple[torch.Tensor, torch.Tensor],
|
||||||
|
- out: torch.Tensor) -> None:
|
||||||
|
+def gemm_fp8_fp8_bf16_nt(lhs: Tuple[paddle.Tensor, paddle.Tensor],
|
||||||
|
+ rhs: Tuple[paddle.Tensor, paddle.Tensor],
|
||||||
|
+ out: paddle.Tensor) -> None:
|
||||||
|
"""
|
||||||
|
Do a normal GEMM with FP8 inputs and BF16 output, with 1x128 LHS scaling and 128x128 RHS scaling.
|
||||||
|
LHS, RHS, RHS scaling factors, and output tensors must be in contiguous format.
|
||||||
|
RHS and RHS scaling factors are required to be transposed.
|
||||||
|
The LHS scaling tensor requires TMA-aligned transposed format, if your input does not match the requirement,
|
||||||
|
- this function will do a transposing with a set of slow PyTorch operations.
|
||||||
|
+ this function will do a transposing with a set of slow paddle operations.
|
||||||
|
|
||||||
|
Arguments:
|
||||||
|
- lhs: the first element is an FP8 tensor (typed `torch.float8_e4m3fn`) of shape `[m, k]`,
|
||||||
|
+ lhs: the first element is an FP8 tensor (typed `paddle.float8_e4m3fn`) of shape `[m, k]`,
|
||||||
|
the second element is an FP32 1x128 scaling tensor for LHS of shape `[m, ⌈k / 128⌉]`.
|
||||||
|
- rhs: the first element is an FP8 tensor (typed `torch.float8_e4m3fn`) of shape `[n, k]`.
|
||||||
|
+ rhs: the first element is an FP8 tensor (typed `paddle.float8_e4m3fn`) of shape `[n, k]`.
|
||||||
|
the second element is an FP32 128x128 scaling tensor for RHS of shape `[⌈n / 128⌉, ⌈k / 128⌉]`.
|
||||||
|
out: the BF16 output tensor of shape `[m, n]`, representing the result.
|
||||||
|
"""
|
||||||
|
@@ -189,22 +189,22 @@ def gemm_fp8_fp8_bf16_nt(lhs: Tuple[torch.Tensor, torch.Tensor],
|
||||||
|
n, k_ = rhs.shape
|
||||||
|
m_, n_ = out.shape
|
||||||
|
|
||||||
|
- assert n % 64 == 0 and k % 128 == 0
|
||||||
|
+ # assert n % 64 == 0 and k % 128 == 0
|
||||||
|
|
||||||
|
# Type and shape checks
|
||||||
|
- assert m == m_ and n == n_ and k == k_
|
||||||
|
- assert n > 0 and k > 0
|
||||||
|
- assert lhs_scales.shape == (m, (k + 127) // 128)
|
||||||
|
- assert rhs_scales.shape == ((n + 127) // 128, (k + 127) // 128)
|
||||||
|
- assert lhs.dtype == torch.float8_e4m3fn and lhs_scales.dtype == torch.float32
|
||||||
|
- assert rhs.dtype == torch.float8_e4m3fn and rhs_scales.dtype == torch.float32
|
||||||
|
- assert out.dtype == torch.bfloat16
|
||||||
|
- assert lhs.is_contiguous() and rhs.is_contiguous() and out.is_contiguous()
|
||||||
|
+ # assert m == m_ and n == n_ and k == k_
|
||||||
|
+ # assert n > 0 and k > 0
|
||||||
|
+ # assert lhs_scales.shape == (m, (k + 127) // 128)
|
||||||
|
+ # assert rhs_scales.shape == ((n + 127) // 128, (k + 127) // 128)
|
||||||
|
+ # assert lhs.dtype == paddle.float8_e4m3fn and lhs_scales.dtype == paddle.float32
|
||||||
|
+ # assert rhs.dtype == paddle.float8_e4m3fn and rhs_scales.dtype == paddle.float32
|
||||||
|
+ # assert out.dtype == paddle.bfloat16
|
||||||
|
+ # assert lhs.is_contiguous() and rhs.is_contiguous() and out.is_contiguous()
|
||||||
|
|
||||||
|
# LHS scales must be transposed for TMA load, but not for RHS scales
|
||||||
|
# NOTES: `get_tma_aligned_lhs_scales` may launch a kernel if not processed by previous kernels
|
||||||
|
lhs_scales = get_col_major_tma_aligned_tensor(lhs_scales)
|
||||||
|
- assert rhs_scales.is_contiguous()
|
||||||
|
+ # assert rhs_scales.is_contiguous()
|
||||||
|
|
||||||
|
# Do nothing if `m` is zero
|
||||||
|
if m == 0:
|
||||||
|
@@ -214,7 +214,7 @@ def gemm_fp8_fp8_bf16_nt(lhs: Tuple[torch.Tensor, torch.Tensor],
|
||||||
|
global includes, template
|
||||||
|
num_sms = get_num_sms()
|
||||||
|
num_sms, block_m, block_n, num_stages, tma_multicast_config, smem_config = get_best_configs(m, n, k, 1, num_sms)
|
||||||
|
- args = (lhs, lhs_scales, rhs, rhs_scales, out, m, torch.cuda.current_stream(), num_sms, smem_config[0])
|
||||||
|
+ args = (lhs, lhs_scales, rhs, rhs_scales, out, m, paddle.device.cuda.current_stream(), num_sms, smem_config[0])
|
||||||
|
runtime = jit_tuner.compile_and_tune(
|
||||||
|
name='gemm_fp8_fp8_bf16_nt',
|
||||||
|
keys={'N': n, 'K': k, 'BLOCK_M': block_m, 'BLOCK_N': block_n,
|
||||||
|
@@ -225,10 +225,10 @@ def gemm_fp8_fp8_bf16_nt(lhs: Tuple[torch.Tensor, torch.Tensor],
|
||||||
|
'IS_TMA_MULTICAST_ON_A': tma_multicast_config[1]},
|
||||||
|
space=(),
|
||||||
|
includes=includes,
|
||||||
|
- arg_defs=(('lhs', torch.float8_e4m3fn), ('lhs_scales', torch.float),
|
||||||
|
- ('rhs', torch.float8_e4m3fn), ('rhs_scales', torch.float),
|
||||||
|
- ('out', torch.bfloat16), ('m', int),
|
||||||
|
- ('stream', torch.cuda.Stream), ('num_sms', int), ('smem_size', int)),
|
||||||
|
+ arg_defs=(('lhs', paddle.float8_e4m3fn), ('lhs_scales', paddle.float32),
|
||||||
|
+ ('rhs', paddle.float8_e4m3fn), ('rhs_scales', paddle.float32),
|
||||||
|
+ ('out', paddle.bfloat16), ('m', int),
|
||||||
|
+ ('stream', paddle.device.cuda.Stream), ('num_sms', int), ('smem_size', int)),
|
||||||
|
template=template,
|
||||||
|
args=args
|
||||||
|
)
|
||||||
|
diff --git a/deep_gemm/jit_kernels/m_grouped_gemm.py b/deep_gemm/jit_kernels/m_grouped_gemm.py
index 3b518c9..ba776bd 100644
--- a/deep_gemm/jit_kernels/m_grouped_gemm.py
+++ b/deep_gemm/jit_kernels/m_grouped_gemm.py
@@ -1,4 +1,4 @@
-import torch
+import paddle
from typing import Tuple

from .gemm import get_best_configs, get_block_n_padding_for_smem_d
@@ -37,25 +37,25 @@ gemm_t::run(out, rhs_scales, grouped_layout,
"""


-def m_grouped_gemm_fp8_fp8_bf16_nt_contiguous(lhs: Tuple[torch.Tensor, torch.Tensor],
- rhs: Tuple[torch.Tensor, torch.Tensor],
- out: torch.Tensor, m_indices: torch.Tensor) -> None:
+def m_grouped_gemm_fp8_fp8_bf16_nt_contiguous(lhs: Tuple[paddle.Tensor, paddle.Tensor],
+ rhs: Tuple[paddle.Tensor, paddle.Tensor],
+ out: paddle.Tensor, m_indices: paddle.Tensor) -> None:
"""
Do a grouped GEMM (contiguous format) with FP8 inputs and BF16 output, with 1x128 LHS scaling and 128x128 RHS scaling.
LHS, RHS, RHS scaling factors, and output tensors must be in contiguous format.
RHS and RHS scaling factors are required to be transposed.
The LHS scaling tensor requires TMA-aligned transposed format, if your input does not match the requirement,
- this function will do a transposing with a set of slow PyTorch operations.
+ this function will do a transposing with a set of slow Pypaddle operations.
On the M axis, inputs are grouped into several batches, of which batch sizes aligned to
`get_m_alignment_for_contiguous_layout()` (128).

Arguments:
- lhs: the first element is an FP8 tensor (typed `torch.float8_e4m3fn`) of shape `[m_sum, k]`,
+ lhs: the first element is an FP8 tensor (typed `paddle.float8_e4m3fn`) of shape `[m_sum, k]`,
the second element is an FP32 1x128 scaling tensor for LHS of shape `[m_sum, ⌈k / 128⌉]`.
- rhs: the first element is an FP8 tensor (typed `torch.float8_e4m3fn`) of shape `[num_groups, n, k]`.
+ rhs: the first element is an FP8 tensor (typed `paddle.float8_e4m3fn`) of shape `[num_groups, n, k]`.
the second element is an FP32 128x128 scaling tensor for RHS of shape `[num_groups, ⌈n / 128⌉, ⌈k / 128⌉]`.
out: the BF16 output tensor of shape `[m_sum, n]`, representing the result.
- m_indices: a tensor of shape `[m_sum]` with type `torch.int`.
+ m_indices: a tensor of shape `[m_sum]` with type `paddle.int`.
`m_indices[i]` records the group which the i-th row of the LHS belong to,
which means that the i-th row of the LHS matrix will be multiplied with `rhs[m_indices[i]]`.
Values of `m_indices` in every-m-alignment-block must also be the same.
@@ -68,19 +68,19 @@ def m_grouped_gemm_fp8_fp8_bf16_nt_contiguous(lhs: Tuple[torch.Tensor, torch.Ten
m__ = m_indices.numel()

# Type and shape checks
- assert m == m_ == m__ and k == k_ and n == n_
- assert lhs_scales.shape == (m, (k + 127) // 128)
- assert rhs_scales.shape == (num_groups, (n + 127) // 128, (k + 127) // 128)
- assert lhs.dtype == torch.float8_e4m3fn and lhs_scales.dtype == torch.float32
- assert rhs.dtype == torch.float8_e4m3fn and rhs_scales.dtype == torch.float32
- assert out.dtype == torch.bfloat16
- assert m_indices.dtype == torch.int32
- assert lhs.is_contiguous() and rhs.is_contiguous()
- assert out.is_contiguous() and m_indices.is_contiguous()
+ # assert m == m_ == m__ and k == k_ and n == n_
+ # assert lhs_scales.shape == (m, (k + 127) // 128)
+ # assert rhs_scales.shape == (num_groups, (n + 127) // 128, (k + 127) // 128)
+ # assert lhs.dtype == paddle.float8_e4m3fn and lhs_scales.dtype == paddle.float32
+ # assert rhs.dtype == paddle.float8_e4m3fn and rhs_scales.dtype == paddle.float32
+ # assert out.dtype == paddle.bfloat16
+ # assert m_indices.dtype == paddle.int32
+ # assert lhs.is_contiguous() and rhs.is_contiguous()
+ # assert out.is_contiguous() and m_indices.is_contiguous()

# LHS scales must be transposed for TMA load, but not for RHS scales
lhs_scales = get_col_major_tma_aligned_tensor(lhs_scales)
- assert rhs_scales.is_contiguous()
+ # assert rhs_scales.is_contiguous()

# Do nothing if `m` is zero
if m == 0:
@@ -92,7 +92,7 @@ def m_grouped_gemm_fp8_fp8_bf16_nt_contiguous(lhs: Tuple[torch.Tensor, torch.Ten
num_sms, block_m, block_n, num_stages, tma_multicast_config, smem_config = get_best_configs(m, n, k, 1, num_sms, is_grouped_contiguous=True)
args = (lhs, lhs_scales, rhs, rhs_scales, out,
m_indices, m, num_groups,
- torch.cuda.current_stream(), num_sms, smem_config[0])
+ paddle.device.cuda.current_stream(), num_sms, smem_config[0])
runtime = jit_tuner.compile_and_tune(
name='m_grouped_gemm_fp8_fp8_bf16_nt',
keys={'N': n, 'K': k, 'BLOCK_M': block_m, 'BLOCK_N': block_n,
@@ -105,11 +105,11 @@ def m_grouped_gemm_fp8_fp8_bf16_nt_contiguous(lhs: Tuple[torch.Tensor, torch.Ten
'GEMM_TYPE': 'GroupedContiguous'},
space=(),
includes=includes,
- arg_defs=(('lhs', torch.float8_e4m3fn), ('lhs_scales', torch.float),
- ('rhs', torch.float8_e4m3fn), ('rhs_scales', torch.float),
- ('out', torch.bfloat16),
- ('grouped_layout', torch.int32), ('m', int), ('num_groups', int),
- ('stream', torch.cuda.Stream), ('num_sms', int), ('smem_size', int)),
+ arg_defs=(('lhs', paddle.float8_e4m3fn), ('lhs_scales', paddle.float32),
+ ('rhs', paddle.float8_e4m3fn), ('rhs_scales', paddle.float32),
+ ('out', paddle.bfloat16),
+ ('grouped_layout', paddle.int32), ('m', int), ('num_groups', int),
+ ('stream', paddle.device.cuda.Stream), ('num_sms', int), ('smem_size', int)),
template=template,
args=args
)
@@ -118,22 +118,22 @@ def m_grouped_gemm_fp8_fp8_bf16_nt_contiguous(lhs: Tuple[torch.Tensor, torch.Ten
runtime(*args)


-def m_grouped_gemm_fp8_fp8_bf16_nt_masked(lhs: Tuple[torch.Tensor, torch.Tensor],
- rhs: Tuple[torch.Tensor, torch.Tensor],
- out: torch.Tensor, masked_m: torch.Tensor, expected_m: int) -> None:
+def m_grouped_gemm_fp8_fp8_bf16_nt_masked(lhs: Tuple[paddle.Tensor, paddle.Tensor],
+ rhs: Tuple[paddle.Tensor, paddle.Tensor],
+ out: paddle.Tensor, masked_m: paddle.Tensor, expected_m: int) -> None:
"""
Do a grouped GEMM (masked format) with FP8 inputs and BF16 output, with 1x128 LHS scaling and 128x128 RHS scaling.
LHS, RHS, RHS scaling factors, and output tensors must be in contiguous format.
RHS and RHS scaling factors are required to be transposed.
The LHS scaling tensor requires TMA-aligned transposed format, if your input does not match the requirement,
- this function will do a transposing with a set of slow PyTorch operations.
+ this function will do a transposing with a set of slow paddle operations.
Moreover, this alignment requirement is different with the contiguous-format kernel, as we require that each batch
should be separately transposed.

Arguments:
- lhs: the first element is an FP8 tensor (typed `torch.float8_e4m3fn`) of shape `[num_groups, m_max, k]`,
+ lhs: the first element is an FP8 tensor (typed `paddle.float8_e4m3fn`) of shape `[num_groups, m_max, k]`,
the second element is an FP32 1x128 scaling tensor for LHS of shape `[num_groups, m_max, ⌈k / 128⌉]`.
- rhs: the first element is an FP8 tensor (typed `torch.float8_e4m3fn`) of shape `[num_groups, n, k]`.
+ rhs: the first element is an FP8 tensor (typed `paddle.float8_e4m3fn`) of shape `[num_groups, n, k]`.
the second element is an FP32 128x128 scaling tensor for RHS of shape `[num_groups, ⌈n / 128⌉, ⌈k / 128⌉]`.
out: the BF16 output tensor of shape `[num_groups, m_max, n]`, representing the result.
masked_m: a tensor of shape `[num_groups]`, `masked_m[i]` records actual rows of the `lhs[i]` matrix to compute
@@ -149,21 +149,21 @@ def m_grouped_gemm_fp8_fp8_bf16_nt_masked(lhs: Tuple[torch.Tensor, torch.Tensor]
num_groups___ = masked_m.numel()

# Type and shape checks
- assert num_groups == num_groups_ == num_groups__ == num_groups___
- assert m == m_ and n == n_ and k == k_
- assert expected_m > 0 and m > 0 and n > 0 and k > 0 and num_groups > 0
- assert lhs_scales.shape == (num_groups, m, (k + 127) // 128)
- assert rhs_scales.shape == (num_groups, (n + 127) // 128, (k + 127) // 128)
- assert lhs.dtype == torch.float8_e4m3fn and lhs_scales.dtype == torch.float32
- assert rhs.dtype == torch.float8_e4m3fn and rhs_scales.dtype == torch.float32
- assert out.dtype == torch.bfloat16
- assert masked_m.dtype == torch.int32
- assert lhs.is_contiguous() and rhs.is_contiguous()
- assert out.is_contiguous() and masked_m.is_contiguous()
+ # assert num_groups == num_groups_ == num_groups__ == num_groups___
+ # assert m == m_ and n == n_ and k == k_
+ # assert expected_m > 0 and m > 0 and n > 0 and k > 0 and num_groups > 0
+ # assert lhs_scales.shape == (num_groups, m, (k + 127) // 128)
+ # assert rhs_scales.shape == (num_groups, (n + 127) // 128, (k + 127) // 128)
+ # assert lhs.dtype == paddle.float8_e4m3fn and lhs_scales.dtype == paddle.float32
+ # assert rhs.dtype == paddle.float8_e4m3fn and rhs_scales.dtype == paddle.float32
+ # assert out.dtype == paddle.bfloat16
+ # assert masked_m.dtype == paddle.int32
+ # assert lhs.is_contiguous() and rhs.is_contiguous()
+ # assert out.is_contiguous() and masked_m.is_contiguous()

# LHS scales must be transposed for TMA load, but not for RHS scales
lhs_scales = get_col_major_tma_aligned_tensor(lhs_scales)
- assert rhs_scales.is_contiguous()
+ # assert rhs_scales.is_contiguous()

# Auto-tuning with compilation
global includes, template
@@ -176,7 +176,7 @@ def m_grouped_gemm_fp8_fp8_bf16_nt_masked(lhs: Tuple[torch.Tensor, torch.Tensor]

args = (lhs, lhs_scales, rhs, rhs_scales, out,
masked_m, m,
- torch.cuda.current_stream(), num_sms, smem_config[0])
+ paddle.device.cuda.current_stream(), num_sms, smem_config[0])
runtime = jit_tuner.compile_and_tune(
name='m_grouped_gemm_fp8_fp8_bf16_nt',
keys={'N': n, 'K': k, 'BLOCK_M': block_m, 'BLOCK_N': block_n,
@@ -189,11 +189,11 @@ def m_grouped_gemm_fp8_fp8_bf16_nt_masked(lhs: Tuple[torch.Tensor, torch.Tensor]
'GEMM_TYPE': 'GroupedMasked'},
space=(),
includes=includes,
- arg_defs=(('lhs', torch.float8_e4m3fn), ('lhs_scales', torch.float),
- ('rhs', torch.float8_e4m3fn), ('rhs_scales', torch.float),
- ('out', torch.bfloat16),
- ('grouped_layout', torch.int32), ('m', int),
- ('stream', torch.cuda.Stream), ('num_sms', int), ('smem_size', int)),
+ arg_defs=(('lhs', paddle.float8_e4m3fn), ('lhs_scales', paddle.float32),
+ ('rhs', paddle.float8_e4m3fn), ('rhs_scales', paddle.float32),
+ ('out', paddle.bfloat16),
+ ('grouped_layout', paddle.int32), ('m', int),
+ ('stream', paddle.device.cuda.Stream), ('num_sms', int), ('smem_size', int)),
template=template,
args=args
)
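As the docstring above notes, every 128-row block of the LHS must map to a single group and `m_indices` repeats that group id per row. One possible way to build such an index tensor from per-group row counts (a hypothetical helper, assuming each group's rows are already padded to the 128 alignment):

import paddle

def build_m_indices(rows_per_group, alignment=128):
    # Pad each group's row count up to the alignment, then repeat the group id
    # once per (padded) row so that every alignment block carries a single id.
    ids = []
    for group_id, rows in enumerate(rows_per_group):
        padded = ((rows + alignment - 1) // alignment) * alignment
        ids.extend([group_id] * padded)
    return paddle.to_tensor(ids, dtype='int32')

# e.g. rows_per_group=[3, 130] -> m_sum = 128 + 256 rows: 128 zeros, then 256 ones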
diff --git a/deep_gemm/jit_kernels/tuner.py b/deep_gemm/jit_kernels/tuner.py
index 6ed6749..9e1d70f 100644
--- a/deep_gemm/jit_kernels/tuner.py
+++ b/deep_gemm/jit_kernels/tuner.py
@@ -1,6 +1,6 @@
import copy
import os
-import torch
+import paddle
from typing import Any, Dict

from ..jit import build, cpp_format, generate, Runtime
@@ -51,10 +51,10 @@ class JITTuner:
continue

# Measure performance with L2 flush and a large GEMM kernel before to reduce overhead between kernels
- start_event = torch.cuda.Event(enable_timing=True)
- end_event = torch.cuda.Event(enable_timing=True)
- torch.empty(int(256e6 // 4), dtype=torch.int, device='cuda').zero_()
- torch.randn((8192, 8192), dtype=torch.float, device='cuda') @ torch.randn((8192, 8192), dtype=torch.float, device='cuda')
+ start_event = paddle.device.cuda.Event(enable_timing=True)
+ end_event = paddle.device.cuda.Event(enable_timing=True)
+ paddle.empty((int(256e6 // 4)), dtype=paddle.int32).zero_()
+ paddle.randn((8192, 8192), dtype=paddle.float32) @ paddle.randn((8192, 8192), dtype=paddle.float32)
start_event.record()
for i in range(20):
assert runtime(*args) == 0
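The tuning loop above times each candidate kernel with CUDA events after flushing L2 with a large buffer. A standalone sketch of the same timing pattern using the Paddle APIs this patch adopts (assuming a CUDA device is available):

import paddle

def time_kernel(fn, num_iters=20):
    # Flush L2 by zero-filling a 256 MB buffer, then time `fn` with CUDA events.
    paddle.empty((int(256e6 // 4),), dtype=paddle.int32).zero_()
    start = paddle.device.cuda.Event(enable_timing=True)
    end = paddle.device.cuda.Event(enable_timing=True)
    start.record()
    for _ in range(num_iters):
        fn()
    end.record()
    paddle.device.cuda.synchronize()
    return start.elapsed_time(end) / num_iters  # milliseconds per call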
diff --git a/deep_gemm/jit_kernels/utils.py b/deep_gemm/jit_kernels/utils.py
index c6da56b..a17b1b1 100644
--- a/deep_gemm/jit_kernels/utils.py
+++ b/deep_gemm/jit_kernels/utils.py
@@ -1,4 +1,4 @@
-import torch
+import paddle

_num_sms = None

@@ -11,7 +11,7 @@ def set_num_sms(num_sms: int) -> None:
num_sms: the desired maximum SM count for all GEMM kernels to use.
"""
global _num_sms
- assert 0 < num_sms <= torch.cuda.get_device_properties(device='cuda').multi_processor_count
+ assert 0 < num_sms <= paddle.device.cuda.get_device_properties().multi_processor_count
_num_sms = num_sms


@@ -25,7 +25,7 @@ def get_num_sms() -> int:
"""
global _num_sms
if _num_sms is None:
- _num_sms = torch.cuda.get_device_properties(device='cuda').multi_processor_count
+ _num_sms = paddle.device.cuda.get_device_properties().multi_processor_count
return _num_sms


@@ -74,9 +74,9 @@ def get_tma_aligned_size(x: int, element_size: int) -> int:
return ceil_div(x, alignment) * alignment


-def get_col_major_tma_aligned_tensor(x: torch.Tensor) -> torch.Tensor:
+def get_col_major_tma_aligned_tensor(x: paddle.Tensor) -> paddle.Tensor:
"""
- Returns TMA-aligned transposed format of the input tensor. `torch.transpose` will be called if necessary.
+ Returns TMA-aligned transposed format of the input tensor. `paddle.transpose` will be called if necessary.
If the input tensor is already column-major layout and 16-byte aligned along the M axis
(thus meets the requirement of LHS scaling tensor in DeepGEMM), this function will do nothing.

@@ -92,18 +92,20 @@ def get_col_major_tma_aligned_tensor(x: torch.Tensor) -> torch.Tensor:
m, n = x.shape[-2], x.shape[-1]
aligned_m = get_tma_aligned_size(m, x.element_size())
if x.dim() == 2:
- if x.stride(0) == 1 and x.stride(1) == aligned_m:
+ if x.strides[0] == 1 and x.strides[1] == aligned_m:
return x
x, remove_dim = x.unsqueeze(0), True

b = x.shape[0]

# The last kernel gives a column-major TMA aligned layout
- if x.stride(0) == aligned_m * n and x.stride(1) == 1 and x.stride(2) == aligned_m:
+ if x.strides[0] == aligned_m * n and x.strides[1] == 1 and x.strides[2] == aligned_m:
return x.squeeze(0) if remove_dim else x

# Normal layout requires transposing
- aligned_x = torch.transpose(torch.empty((b, n, aligned_m), device=x.device, dtype=x.dtype), 1, 2)
+ aligned_x = paddle.transpose(
+ paddle.empty((b, n, aligned_m), dtype=x.dtype), perm=[0, 2, 1]
+ )
aligned_x[:, :m, :] = x
aligned_x = aligned_x[:, :m, :]
return aligned_x.squeeze(0) if remove_dim else aligned_x
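The hunk above swaps `x.stride(i)` for Paddle's `x.strides[i]` and rebuilds the column-major buffer via `paddle.transpose(..., perm=[0, 2, 1])`. A small sketch of the alignment rule it relies on, i.e. the M axis rounded up to a 16-byte multiple (`tma_aligned_rows` is a hypothetical helper mirroring `get_tma_aligned_size`):

import paddle

def tma_aligned_rows(m, element_size):
    # Round the row count up so that m * element_size is a multiple of 16 bytes,
    # the alignment TMA expects along the M axis of the LHS scaling tensor.
    alignment = 16 // element_size
    return (m + alignment - 1) // alignment * alignment

scales = paddle.randn((100, 7), dtype=paddle.float32)   # FP32: element_size = 4
aligned_m = tma_aligned_rows(scales.shape[0], 4)        # 100 -> 100 (already a multiple of 4)
buf = paddle.transpose(paddle.empty((7, aligned_m), dtype=scales.dtype), perm=[1, 0])
buf[:scales.shape[0], :] = scales                       # column-major buffer, padded rows untouched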
diff --git a/deep_gemm/paddle_utils.py b/deep_gemm/paddle_utils.py
new file mode 100644
index 0000000..2326807
--- /dev/null
+++ b/deep_gemm/paddle_utils.py
@@ -0,0 +1,20 @@
+import os
+
+def get_cuda_home():
+    """Get Cuda home directory"""
+    cuda_home = os.environ.get("CUDA_HOME") or os.environ.get("CUDA_PATH")
+    if cuda_home:
+        return cuda_home
+
+    try:
+        which_cmd = "which nvcc"
+
+        nvcc_path = os.popen(which_cmd).read().strip()
+        if nvcc_path:
+            return os.path.dirname(os.path.dirname(nvcc_path))
+    except Exception:
+        pass
+
+    return None
+
+CUDA_HOME = get_cuda_home()
\ No newline at end of file
diff --git a/deep_gemm/utils.py b/deep_gemm/utils.py
index d5cdd01..5237f09 100644
--- a/deep_gemm/utils.py
+++ b/deep_gemm/utils.py
@@ -1,15 +1,15 @@
import os
import sys
import time
-import torch
-import torch.distributed as dist
+import paddle
+import paddle.distributed as dist


def bench(fn, num_warmups: int = 5, num_tests: int = 10,
high_precision: bool = False):
# Flush L2 cache with 256 MB data
- torch.cuda.synchronize()
- cache = torch.empty(int(256e6 // 4), dtype=torch.int, device='cuda')
+ paddle.device.cuda.synchronize()
+ cache = paddle.empty((int(256e6 // 4)), dtype=paddle.int32)
cache.zero_()

# Warmup
@@ -18,18 +18,18 @@ def bench(fn, num_warmups: int = 5, num_tests: int = 10,

# Add a large kernel to eliminate the CPU launch overhead
if high_precision:
- x = torch.randn((8192, 8192), dtype=torch.float, device='cuda')
- y = torch.randn((8192, 8192), dtype=torch.float, device='cuda')
+ x = paddle.randn((8192, 8192), dtype=paddle.float32)
+ y = paddle.randn((8192, 8192), dtype=paddle.float32)
x @ y

# Testing
- start_event = torch.cuda.Event(enable_timing=True)
- end_event = torch.cuda.Event(enable_timing=True)
+ start_event = paddle.device.cuda.Event(enable_timing=True)
+ end_event = paddle.device.cuda.Event(enable_timing=True)
start_event.record()
for i in range(num_tests):
fn()
end_event.record()
- torch.cuda.synchronize()
+ paddle.device.synchronize()

return start_event.elapsed_time(end_event) / num_tests

@@ -106,21 +106,21 @@ def bench_kineto(fn, kernel_names, num_tests: int = 30, suppress_kineto_output:
# Profile
suppress = suppress_stdout_stderr if suppress_kineto_output and not using_nsys else empty_suppress
with suppress():
- schedule = torch.profiler.schedule(wait=0, warmup=1, active=1, repeat=1) if not using_nsys else None
- profiler = torch.profiler.profile(activities=[torch.profiler.ProfilerActivity.CUDA], schedule=schedule) if not using_nsys else empty_suppress()
+ scheduler = paddle.profiler.make_scheduler(closed=0, ready=1, record=1, repeat=1) if not using_nsys else None
+ profiler = paddle.profiler.Profiler(targets=[paddle.profiler.ProfilerTarget.CPU, paddle.profiler.ProfilerTarget.GPU], scheduler=scheduler) if not using_nsys else empty_suppress()
with profiler:
for i in range(2):
# NOTES: use a large kernel and a barrier to eliminate the unbalanced CPU launch overhead
if barrier_comm_profiling:
- lhs = torch.randn((8192, 8192), dtype=torch.float, device='cuda')
- rhs = torch.randn((8192, 8192), dtype=torch.float, device='cuda')
+ lhs = paddle.randn((8192, 8192), dtype=paddle.float32)
+ rhs = paddle.randn((8192, 8192), dtype=paddle.float32)
lhs @ rhs
- dist.all_reduce(torch.ones(1, dtype=torch.float, device='cuda'))
+ dist.all_reduce(paddle.ones(1, dtype=paddle.float32))
for _ in range(num_tests):
if sleep_between_tests > 0.0:
time.sleep(sleep_between_tests)
if flush_l2:
- torch.empty(flush_l2_size, dtype=torch.int, device='cuda').zero_()
+ paddle.empty(flush_l2_size, dtype=paddle.int32).zero_()
fn()

if not using_nsys:
--
2.43.0
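The final hunks replace the torch.profiler schedule/profile pair with Paddle's scheduler-based profiler. A minimal standalone sketch of that profiling pattern, using the same `make_scheduler` arguments as the patch (assuming a CUDA device; `profile_once` is a hypothetical wrapper, not part of the patch):

import paddle.profiler as profiler

def profile_once(fn):
    # One warm-up step ("ready") followed by one recorded step, mirroring
    # torch.profiler.schedule(wait=0, warmup=1, active=1, repeat=1).
    sched = profiler.make_scheduler(closed=0, ready=1, record=1, repeat=1)
    prof = profiler.Profiler(
        targets=[profiler.ProfilerTarget.CPU, profiler.ProfilerTarget.GPU],
        scheduler=sched)
    with prof:
        for _ in range(2):
            fn()
            prof.step()   # advance the scheduler after every iteration
    prof.summary()        # print the collected kernel/op time table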
@@ -1,188 +0,0 @@
|
|||||||
// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
|
|
||||||
//
|
|
||||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
||||||
// you may not use this file except in compliance with the License.
|
|
||||||
// You may obtain a copy of the License at
|
|
||||||
//
|
|
||||||
// http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
//
|
|
||||||
// Unless required by applicable law or agreed to in writing, software
|
|
||||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
// See the License for the specific language governing permissions and
|
|
||||||
// limitations under the License.
|
|
||||||
#include "dtype.h"
|
|
||||||
#include "matmul_helper.h"
|
|
||||||
#include "my_types.h"
|
|
||||||
#include "paddle/extension.h"
|
|
||||||
#include "paddle/phi/core/kernel_registry.h"
|
|
||||||
template <typename T>
|
|
||||||
void AvxCompute(const paddle::Tensor &x,
|
|
||||||
const paddle::Tensor &weight,
|
|
||||||
const paddle::Tensor &w_bias,
|
|
||||||
bool trans,
|
|
||||||
const std::string alog,
|
|
||||||
paddle::Tensor &out,
|
|
||||||
xft::Matrix<T> &quantizedWeight,
|
|
||||||
xft::Vector<float> &WeightScale,
|
|
||||||
xft::Vector<float> &WeightZero,
|
|
||||||
xft::Vector<float> &WeightSum,
|
|
||||||
MMHelper *mmHelper) {
|
|
||||||
auto out_data = out.data<float>();
|
|
||||||
const float *x_data = reinterpret_cast<const float *>(x.data<float>());
|
|
||||||
const float *bias_data = nullptr;
|
|
||||||
if (w_bias.initialized()) {
|
|
||||||
bias_data = reinterpret_cast<const float *>(w_bias.data<float>());
|
|
||||||
}
|
|
||||||
int m = 1;
|
|
||||||
for (int i = 0; i < x.shape().size() - 1; i++) {
|
|
||||||
m = m * x.shape()[i];
|
|
||||||
}
|
|
||||||
int k = x.shape()[x.shape().size() - 1];
|
|
||||||
int l = weight.shape()[1];
|
|
||||||
int n = weight.shape()[1];
|
|
||||||
if (w_bias.initialized()) {
|
|
||||||
mmHelper->compute_bias(false,
|
|
||||||
m,
|
|
||||||
n,
|
|
||||||
k,
|
|
||||||
1.0f,
|
|
||||||
x_data,
|
|
||||||
k,
|
|
||||||
quantizedWeight.Data(),
|
|
||||||
WeightScale.Data(),
|
|
||||||
WeightZero.Data(),
|
|
||||||
WeightSum.Data(),
|
|
||||||
0.0f,
|
|
||||||
out_data,
|
|
||||||
l,
|
|
||||||
bias_data);
|
|
||||||
} else {
|
|
||||||
mmHelper->compute(false,
|
|
||||||
m,
|
|
||||||
n,
|
|
||||||
k,
|
|
||||||
1.0f,
|
|
||||||
x_data,
|
|
||||||
k,
|
|
||||||
quantizedWeight.Data(),
|
|
||||||
WeightScale.Data(),
|
|
||||||
WeightZero.Data(),
|
|
||||||
WeightSum.Data(),
|
|
||||||
0.0,
|
|
||||||
out_data,
|
|
||||||
l);
|
|
||||||
}
|
|
||||||
};
|
|
||||||
template <typename T>
|
|
||||||
void AvxWeightOnly(const paddle::Tensor &x,
|
|
||||||
const paddle::Tensor &weight,
|
|
||||||
const paddle::Tensor &w_bias,
|
|
||||||
bool trans,
|
|
||||||
const std::string alog,
|
|
||||||
paddle::Tensor &out) {
|
|
||||||
static std::unordered_map<std::string,
|
|
||||||
std::tuple<xft::Matrix<T> *,
|
|
||||||
xft::Vector<float> *,
|
|
||||||
xft::Vector<float> *,
|
|
||||||
xft::Vector<float> *>>
|
|
||||||
weight_only_hub;
|
|
||||||
std::stringstream weights_addr;
|
|
||||||
weights_addr << weight.data<float>() << alog;
|
|
||||||
std::string weight_only_key = weights_addr.str();
|
|
||||||
auto it_created = weight_only_hub.find(weight_only_key);
|
|
||||||
static MMHelper *mmHelper;
|
|
||||||
int rows = weight.shape()[0], cols = weight.shape()[1];
|
|
||||||
xft::Vector<float> *WeightScale =
|
|
||||||
new xft::Vector<float>(); // if weight is int8
|
|
||||||
xft::Vector<float> *WeightZero =
|
|
||||||
new xft::Vector<float>(); // if weight is int8
|
|
||||||
xft::Vector<float> *WeightSum =
|
|
||||||
new xft::Vector<float>(); // if weight is int8
|
|
||||||
xft::Matrix<T> *quantizedWeight = new xft::Matrix<T>();
|
|
||||||
if (it_created == weight_only_hub.end()) {
|
|
||||||
auto weight_ptr = reinterpret_cast<const float *>(weight.data<float>());
|
|
||||||
xft::Matrix<T> convertedWeight;
|
|
||||||
mmHelper = new MMHelper(xft::DeviceKind::iCPU, 0);
|
|
||||||
mmHelper->convertWeight(trans,
|
|
||||||
rows,
|
|
||||||
cols,
|
|
||||||
weight_ptr,
|
|
||||||
nullptr,
|
|
||||||
nullptr,
|
|
||||||
convertedWeight,
|
|
||||||
*WeightScale,
|
|
||||||
*WeightZero,
|
|
||||||
*WeightSum);
|
|
||||||
quantizedWeight->Resize(rows, cols);
|
|
||||||
mmHelper->packWeight(trans, convertedWeight, *quantizedWeight);
|
|
||||||
weight_only_hub[weight_only_key] = std::make_tuple(
|
|
||||||
quantizedWeight, WeightScale, WeightZero, WeightSum);
|
|
||||||
AvxCompute<T>(x,
|
|
||||||
weight,
|
|
||||||
w_bias,
|
|
||||||
trans,
|
|
||||||
alog,
|
|
||||||
out,
|
|
||||||
*quantizedWeight,
|
|
||||||
*WeightScale,
|
|
||||||
*WeightZero,
|
|
||||||
*WeightSum,
|
|
||||||
mmHelper);
|
|
||||||
} else {
|
|
||||||
AvxCompute<T>(x,
|
|
||||||
weight,
|
|
||||||
w_bias,
|
|
||||||
trans,
|
|
||||||
alog,
|
|
||||||
out,
|
|
||||||
*(std::get<0>(it_created->second)),
|
|
||||||
*(std::get<1>(it_created->second)),
|
|
||||||
*(std::get<2>(it_created->second)),
|
|
||||||
*(std::get<3>(it_created->second)),
|
|
||||||
mmHelper);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
std::vector<paddle::Tensor> InvokeAvxWeightOnly(const paddle::Tensor &x,
|
|
||||||
const paddle::Tensor &weight,
|
|
||||||
const paddle::Tensor &w_bias,
|
|
||||||
const std::string &alog,
|
|
||||||
bool trans) {
|
|
||||||
auto out_shape = x.shape();
|
|
||||||
out_shape[out_shape.size() - 1] = weight.shape()[1];
|
|
||||||
auto out = paddle::empty(out_shape, x.dtype(), paddle::CPUPlace());
|
|
||||||
if (alog == "int8") {
|
|
||||||
AvxWeightOnly<int8_t>(x, weight, w_bias, trans, alog, out);
|
|
||||||
} else if (alog == "fp16") {
|
|
||||||
AvxWeightOnly<float16_t>(x, weight, w_bias, trans, alog, out);
|
|
||||||
} else {
|
|
||||||
AvxWeightOnly<float16_t>(x, weight, w_bias, trans, alog, out);
|
|
||||||
}
|
|
||||||
return {out};
|
|
||||||
}
|
|
||||||
|
|
||||||
std::vector<std::vector<int64_t>> AvxWeightOnlyInferShape(
|
|
||||||
std::vector<int64_t> x_shape,
|
|
||||||
std::vector<int64_t> weigh_shape,
|
|
||||||
std::vector<int64_t> weigh_bias_shape) {
|
|
||||||
int m = 1;
|
|
||||||
for (int i = 0; i < x_shape.size() - 1; i++) {
|
|
||||||
m = m * x_shape[i];
|
|
||||||
}
|
|
||||||
return {std::vector<int64_t>{m, weigh_shape[1]}};
|
|
||||||
}
|
|
||||||
|
|
||||||
std::vector<paddle::DataType> AvxWeightOnlyInferDtype(
|
|
||||||
paddle::DataType x_dtype,
|
|
||||||
paddle::DataType weight_dtype,
|
|
||||||
paddle::DataType weight_bias_dtype) {
|
|
||||||
return {x_dtype};
|
|
||||||
}
|
|
||||||
|
|
||||||
PD_BUILD_STATIC_OP(avx_weight_only)
|
|
||||||
.Inputs({"x", "weight", "w_bias"})
|
|
||||||
.Outputs({"out"})
|
|
||||||
.Attrs({"alog: std::string", "trans:bool"})
|
|
||||||
.SetKernelFn(PD_KERNEL(InvokeAvxWeightOnly))
|
|
||||||
.SetInferShapeFn(PD_INFER_SHAPE(AvxWeightOnlyInferShape))
|
|
||||||
.SetInferDtypeFn(PD_INFER_DTYPE(AvxWeightOnlyInferDtype));
|
|
268
custom_ops/cpu_ops/rebuild_padding.cc
Normal file
@@ -0,0 +1,268 @@
|
|||||||
|
// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
//
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
// you may not use this file except in compliance with the License.
|
||||||
|
// You may obtain a copy of the License at
|
||||||
|
//
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
//
|
||||||
|
// Unless required by applicable law or agreed to in writing, software
|
||||||
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
// See the License for the specific language governing permissions and
|
||||||
|
// limitations under the License.
|
||||||
|
|
||||||
|
#include <vector>
|
||||||
|
#include "paddle/extension.h"
|
||||||
|
|
||||||
|
#ifndef PD_BUILD_STATIC_OP
|
||||||
|
#define PD_BUILD_STATIC_OP(name) PD_BUILD_OP(static_op_##name)
|
||||||
|
#endif
|
||||||
|
|
||||||
|
template <typename T>
|
||||||
|
void RebuildPaddingCPUImpl(T *output_data,
|
||||||
|
const T *input_data,
|
||||||
|
const int *cum_offsets_data,
|
||||||
|
const int *seq_len_this_time_data,
|
||||||
|
const int *seq_lens_decoder_data,
|
||||||
|
const int *seq_lens_encoder_data,
|
||||||
|
int max_input_length,
|
||||||
|
int dim_embed,
|
||||||
|
const int elem_nums) {
|
||||||
|
for (int i = 0; i < elem_nums; ++i) {
|
||||||
|
const int bi = i / dim_embed;
|
||||||
|
const int bias_idx = i % dim_embed;
|
||||||
|
int seq_id = 0;
|
||||||
|
|
||||||
|
if (seq_len_this_time_data[bi] == 0) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
if (seq_lens_decoder_data[bi] == 0 && seq_lens_encoder_data[bi] == 0) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
if (seq_lens_encoder_data[bi] > 0) {
|
||||||
|
seq_id = seq_lens_encoder_data[bi] - 1;
|
||||||
|
}
|
||||||
|
const int ori_token_idx =
|
||||||
|
bi * max_input_length - cum_offsets_data[bi] + seq_id;
|
||||||
|
const int src_offset = ori_token_idx * dim_embed + bias_idx;
|
||||||
|
|
||||||
|
output_data[i] = input_data[src_offset];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename T>
|
||||||
|
void RebuildAppendPaddingCPUImpl(T *output_data,
|
||||||
|
const T *input_data,
|
||||||
|
const int *cum_offsets_data,
|
||||||
|
const int *seq_len_this_time_data,
|
||||||
|
const int *seq_lens_decoder_data,
|
||||||
|
const int *seq_lens_encoder_data,
|
||||||
|
const int *output_padding_offset_data,
|
||||||
|
const int max_input_length,
|
||||||
|
const int dim_embed,
|
||||||
|
const int64_t output_elem_nums) {
|
||||||
|
for (int i = 0; i < output_elem_nums; ++i) {
|
||||||
|
int out_token_id = i / dim_embed;
|
||||||
|
int ori_token_id =
|
||||||
|
out_token_id + output_padding_offset_data[out_token_id];
|
||||||
|
int bi = ori_token_id / max_input_length;
|
||||||
|
if (seq_len_this_time_data[bi] == 0 ||
|
||||||
|
(seq_lens_decoder_data[bi] == 0 &&
|
||||||
|
seq_lens_encoder_data[bi] == 0)) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
int seq_id = 0;
|
||||||
|
if (seq_lens_encoder_data[bi] > 0) {
|
||||||
|
seq_id = seq_lens_encoder_data[bi] - 1;
|
||||||
|
}
|
||||||
|
int input_token_id = ori_token_id - cum_offsets_data[bi] + seq_id;
|
||||||
|
int bias_idx = i % dim_embed;
|
||||||
|
int src_offset = input_token_id * dim_embed + bias_idx;
|
||||||
|
output_data[i] = input_data[src_offset];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
std::vector<paddle::Tensor> RebuildPaddingCPU(
|
||||||
|
const paddle::Tensor &tmp_out,
|
||||||
|
const paddle::Tensor &cum_offsets,
|
||||||
|
const paddle::Tensor &seq_len_this_time,
|
||||||
|
const paddle::Tensor &seq_lens_decoder,
|
||||||
|
const paddle::Tensor &seq_lens_encoder,
|
||||||
|
const paddle::optional<paddle::Tensor> &output_padding_offset,
|
||||||
|
int max_input_length) {
|
||||||
|
auto tmp_out_cpu = tmp_out.copy_to(paddle::CPUPlace(), true);
|
||||||
|
auto cum_offsets_cpu = cum_offsets.copy_to(paddle::CPUPlace(), true);
|
||||||
|
auto seq_len_this_time_cpu =
|
||||||
|
seq_len_this_time.copy_to(paddle::CPUPlace(), true);
|
||||||
|
auto seq_lens_decoder_cpu =
|
||||||
|
seq_lens_decoder.copy_to(paddle::CPUPlace(), true);
|
||||||
|
auto seq_lens_encoder_cpu =
|
||||||
|
seq_lens_encoder.copy_to(paddle::CPUPlace(), true);
|
||||||
|
paddle::optional<paddle::Tensor> output_padding_offset_cpu;
|
||||||
|
if (output_padding_offset) {
|
||||||
|
output_padding_offset_cpu =
|
||||||
|
output_padding_offset->copy_to(paddle::CPUPlace(), true);
|
||||||
|
}
|
||||||
|
|
||||||
|
int token_num = tmp_out_cpu.shape()[0];
|
||||||
|
int dim_embed = tmp_out_cpu.shape()[1];
|
||||||
|
int bsz = cum_offsets_cpu.shape()[0];
|
||||||
|
|
||||||
|
paddle::Tensor out;
|
||||||
|
if (output_padding_offset_cpu) {
|
||||||
|
int need_delete_token_num = 0;
|
||||||
|
for (int i = 0; i < bsz; ++i) {
|
||||||
|
if (seq_lens_encoder_cpu.data<int>()[i] > 0) {
|
||||||
|
need_delete_token_num +=
|
||||||
|
seq_lens_encoder_cpu.data<int>()[i] - 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
int output_token_num = token_num - need_delete_token_num;
|
||||||
|
out = paddle::full({output_token_num, dim_embed},
|
||||||
|
0,
|
||||||
|
tmp_out_cpu.dtype(),
|
||||||
|
paddle::CPUPlace());
|
||||||
|
} else {
|
||||||
|
out = paddle::full(
|
||||||
|
{bsz, dim_embed}, 0, tmp_out_cpu.dtype(), paddle::CPUPlace());
|
||||||
|
}
|
||||||
|
|
||||||
|
const int *cum_offsets_data = cum_offsets_cpu.data<int>();
|
||||||
|
const int *seq_len_this_time_data = seq_len_this_time_cpu.data<int>();
|
||||||
|
const int *seq_lens_decoder_data = seq_lens_decoder_cpu.data<int>();
|
||||||
|
const int *seq_lens_encoder_data = seq_lens_encoder_cpu.data<int>();
|
||||||
|
int elem_nums = out.numel();
|
||||||
|
|
||||||
|
if (output_padding_offset_cpu) {
|
||||||
|
const int *output_padding_offset_data =
|
||||||
|
output_padding_offset_cpu->data<int>();
|
||||||
|
switch (tmp_out_cpu.dtype()) {
|
||||||
|
case paddle::DataType::FLOAT32:
|
||||||
|
RebuildAppendPaddingCPUImpl<float>(out.data<float>(),
|
||||||
|
tmp_out_cpu.data<float>(),
|
||||||
|
cum_offsets_data,
|
||||||
|
seq_len_this_time_data,
|
||||||
|
seq_lens_decoder_data,
|
||||||
|
seq_lens_encoder_data,
|
||||||
|
output_padding_offset_data,
|
||||||
|
max_input_length,
|
||||||
|
dim_embed,
|
||||||
|
elem_nums);
|
||||||
|
break;
|
||||||
|
case paddle::DataType::FLOAT16:
|
||||||
|
RebuildAppendPaddingCPUImpl<paddle::float16>(
|
||||||
|
out.data<paddle::float16>(),
|
||||||
|
tmp_out_cpu.data<paddle::float16>(),
|
||||||
|
cum_offsets_data,
|
||||||
|
seq_len_this_time_data,
|
||||||
|
seq_lens_decoder_data,
|
||||||
|
seq_lens_encoder_data,
|
||||||
|
output_padding_offset_data,
|
||||||
|
max_input_length,
|
||||||
|
dim_embed,
|
||||||
|
elem_nums);
|
||||||
|
break;
|
||||||
|
case paddle::DataType::BFLOAT16:
|
||||||
|
RebuildAppendPaddingCPUImpl<paddle::bfloat16>(
|
||||||
|
out.data<paddle::bfloat16>(),
|
||||||
|
tmp_out_cpu.data<paddle::bfloat16>(),
|
||||||
|
cum_offsets_data,
|
||||||
|
seq_len_this_time_data,
|
||||||
|
seq_lens_decoder_data,
|
||||||
|
seq_lens_encoder_data,
|
||||||
|
output_padding_offset_data,
|
||||||
|
max_input_length,
|
||||||
|
dim_embed,
|
||||||
|
elem_nums);
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
PD_THROW(
|
||||||
|
"Unsupported data type for rebuild_padding_cpu. "
|
||||||
|
"Only float32, float16, and bfloat16 are supported.");
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
switch (tmp_out_cpu.dtype()) {
|
||||||
|
case paddle::DataType::FLOAT32:
|
||||||
|
RebuildPaddingCPUImpl<float>(out.data<float>(),
|
||||||
|
tmp_out_cpu.data<float>(),
|
||||||
|
cum_offsets_data,
|
||||||
|
seq_len_this_time_data,
|
||||||
|
seq_lens_decoder_data,
|
||||||
|
seq_lens_encoder_data,
|
||||||
|
max_input_length,
|
||||||
|
dim_embed,
|
||||||
|
elem_nums);
|
||||||
|
break;
|
||||||
|
case paddle::DataType::FLOAT16:
|
||||||
|
RebuildPaddingCPUImpl<paddle::float16>(
|
||||||
|
out.data<paddle::float16>(),
|
||||||
|
tmp_out_cpu.data<paddle::float16>(),
|
||||||
|
cum_offsets_data,
|
||||||
|
seq_len_this_time_data,
|
||||||
|
seq_lens_decoder_data,
|
||||||
|
seq_lens_encoder_data,
|
||||||
|
max_input_length,
|
||||||
|
dim_embed,
|
||||||
|
elem_nums);
|
||||||
|
break;
|
||||||
|
case paddle::DataType::BFLOAT16:
|
||||||
|
|
||||||
|
RebuildPaddingCPUImpl<paddle::bfloat16>(
|
||||||
|
out.data<paddle::bfloat16>(),
|
||||||
|
tmp_out_cpu.data<paddle::bfloat16>(),
|
||||||
|
cum_offsets_data,
|
||||||
|
seq_len_this_time_data,
|
||||||
|
seq_lens_decoder_data,
|
||||||
|
seq_lens_encoder_data,
|
||||||
|
max_input_length,
|
||||||
|
dim_embed,
|
||||||
|
elem_nums);
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
PD_THROW(
|
||||||
|
"Unsupported data type for rebuild_padding_cpu. "
|
||||||
|
"Only float32, float16, and bfloat16 are supported.");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return {out};
|
||||||
|
}
|
||||||
|
|
||||||
|
std::vector<std::vector<int64_t>> RebuildPaddingInferShape(
|
||||||
|
const std::vector<int64_t> &tmp_out_shape,
|
||||||
|
const std::vector<int64_t> &cum_offsets_shape,
|
||||||
|
const std::vector<int64_t> &seq_len_this_time_shape,
|
||||||
|
const std::vector<int64_t> &seq_lens_decoder_shape,
|
||||||
|
const std::vector<int64_t> &seq_lens_encoder_shape,
|
||||||
|
const paddle::optional<std::vector<int64_t>> &output_padding_offset_shape) {
|
||||||
|
int64_t dim_embed = tmp_out_shape[1];
|
||||||
|
if (output_padding_offset_shape) {
|
||||||
|
return {{-1, dim_embed}};
|
||||||
|
} else {
|
||||||
|
int64_t bsz = cum_offsets_shape[0];
|
||||||
|
return {{bsz, dim_embed}};
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
std::vector<paddle::DataType> RebuildPaddingInferDtype(
|
||||||
|
const paddle::DataType &tmp_out_dtype,
|
||||||
|
const paddle::DataType &cum_offsets_dtype,
|
||||||
|
const paddle::DataType &seq_len_this_time_dtype,
|
||||||
|
const paddle::DataType &seq_lens_decoder_dtype,
|
||||||
|
const paddle::DataType &seq_lens_encoder_dtype,
|
||||||
|
const paddle::optional<paddle::DataType> &output_padding_offset_dtype) {
|
||||||
|
return {tmp_out_dtype};
|
||||||
|
}
|
||||||
|
|
||||||
|
PD_BUILD_STATIC_OP(rebuild_padding_cpu)
|
||||||
|
.Inputs({"tmp_out",
|
||||||
|
"cum_offsets",
|
||||||
|
"seq_len_this_time",
|
||||||
|
"seq_lens_decoder",
|
||||||
|
"seq_lens_encoder",
|
||||||
|
paddle::Optional("output_padding_offset")})
|
||||||
|
.Outputs({"out"})
|
||||||
|
.Attrs({"max_input_length: int"})
|
||||||
|
.SetKernelFn(PD_KERNEL(RebuildPaddingCPU))
|
||||||
|
.SetInferShapeFn(PD_INFER_SHAPE(RebuildPaddingInferShape))
|
||||||
|
.SetInferDtypeFn(PD_INFER_DTYPE(RebuildPaddingInferDtype));
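For reference, the gather performed by rebuild_padding_cpu above (keep one hidden-state row per active sequence: the last encoder token during prefill, the single decoder token otherwise) can be written as the following NumPy sketch; it is a readability aid only, not the registered op:

import numpy as np

def rebuild_padding_ref(tmp_out, cum_offsets, seq_len_this_time,
                        seq_lens_decoder, seq_lens_encoder, max_input_length):
    # tmp_out: [token_num, dim_embed] packed (unpadded) hidden states.
    bsz, dim_embed = len(cum_offsets), tmp_out.shape[1]
    out = np.zeros((bsz, dim_embed), dtype=tmp_out.dtype)
    for bi in range(bsz):
        if seq_len_this_time[bi] == 0:
            continue
        if seq_lens_decoder[bi] == 0 and seq_lens_encoder[bi] == 0:
            continue
        # Prefill step: take the last encoder token; decode step: the single token.
        seq_id = seq_lens_encoder[bi] - 1 if seq_lens_encoder[bi] > 0 else 0
        ori_token_idx = bi * max_input_length - cum_offsets[bi] + seq_id
        out[bi] = tmp_out[ori_token_idx]
    return out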
|
@@ -1,201 +0,0 @@
|
|||||||
// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
|
|
||||||
//
|
|
||||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
||||||
// you may not use this file except in compliance with the License.
|
|
||||||
// You may obtain a copy of the License at
|
|
||||||
//
|
|
||||||
// http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
//
|
|
||||||
// Unless required by applicable law or agreed to in writing, software
|
|
||||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
// See the License for the specific language governing permissions and
|
|
||||||
// limitations under the License.
|
|
||||||
|
|
||||||
#include "layers_decoder.h"
|
|
||||||
#include "paddle/extension.h"
|
|
||||||
#include "paddle/phi/core/kernel_registry.h"
|
|
||||||
|
|
||||||
std::vector<paddle::Tensor> InvokeAllLLaMALayer(
|
|
||||||
const paddle::Tensor &input,
|
|
||||||
const std::vector<paddle::Tensor> &ln1Gamma,
|
|
||||||
const std::vector<paddle::Tensor> &ln1Beta,
|
|
||||||
const std::vector<paddle::Tensor> &qkvWeight,
|
|
||||||
const std::vector<paddle::Tensor> &qkvBiasWeight,
|
|
||||||
const std::vector<paddle::Tensor> &attnOutWeight,
|
|
||||||
const std::vector<paddle::Tensor> &attnOutBias,
|
|
||||||
const std::vector<paddle::Tensor> &ln2Gamma,
|
|
||||||
const std::vector<paddle::Tensor> &ln2Beta,
|
|
||||||
const std::vector<paddle::Tensor> &gateWeight,
|
|
||||||
const std::vector<paddle::Tensor> &gateBias,
|
|
||||||
const std::vector<paddle::Tensor> &upWeight,
|
|
||||||
const std::vector<paddle::Tensor> &upBias,
|
|
||||||
const std::vector<paddle::Tensor> &downWeight,
|
|
||||||
const std::vector<paddle::Tensor> &downBias,
|
|
||||||
const paddle::Tensor &pastSeqLen,
|
|
||||||
const paddle::Tensor ¤tSeqLen,
|
|
||||||
const paddle::Tensor &step,
|
|
||||||
int hiddensize,
|
|
||||||
int totalLayer,
|
|
||||||
const std::string &computeType,
|
|
||||||
const std::string &activation,
|
|
||||||
const std::string &normType,
|
|
||||||
int attHeadDim,
|
|
||||||
int attHeadNum,
|
|
||||||
int kvHeadNum,
|
|
||||||
int maxPositions,
|
|
||||||
int maxPosEmbed,
|
|
||||||
int intermediateSize) {
|
|
||||||
auto out = paddle::empty_like(input);
|
|
||||||
auto batchSize = input.shape()[0];
|
|
||||||
auto inputSeqLen = input.shape()[1];
|
|
||||||
auto past_seq_len = pastSeqLen.data<int64_t>()[0];
|
|
||||||
auto cur_seq_len = static_cast<int64_t>(currentSeqLen.data<int32_t>()[0]);
|
|
||||||
auto step_id = step.data<int64_t>()[0];
|
|
||||||
auto output_ptr = reinterpret_cast<void *>(out.data<float>());
|
|
||||||
auto xft_data_type = xft::DataType::fp16;
|
|
||||||
if (computeType == "bf16") {
|
|
||||||
xft_data_type = xft::DataType::bf16;
|
|
||||||
} else if (computeType == "bf16_int8") {
|
|
||||||
xft_data_type = xft::DataType::bf16_int8;
|
|
||||||
}
|
|
||||||
auto xft_act_type = xft::ActivationType::SILU;
|
|
||||||
if (activation == "relu") {
|
|
||||||
xft_act_type = xft::ActivationType::RELU;
|
|
||||||
} else if (activation == "gelu") {
|
|
||||||
xft_act_type = xft::ActivationType::GELU;
|
|
||||||
} else if (activation == "swiglu") {
|
|
||||||
xft_act_type = xft::ActivationType::SWIGLU;
|
|
||||||
}
|
|
||||||
auto xft_norm_type = xft::NormType::RMS;
|
|
||||||
if (normType == "layernorm") {
|
|
||||||
xft_norm_type = xft::NormType::LN;
|
|
||||||
}
|
|
||||||
auto input_ptr = reinterpret_cast<const void *>(input.data<float>());
|
|
||||||
for (int i = 0; i < totalLayer; ++i) {
|
|
||||||
auto ln1Gamma_ptr =
|
|
||||||
reinterpret_cast<const float *>(ln1Gamma[i].data<float>());
|
|
||||||
auto ln1Beta_ptr =
|
|
||||||
reinterpret_cast<const float *>(ln1Beta[i].data<float>());
|
|
||||||
auto qkvWeight_ptr =
|
|
||||||
reinterpret_cast<const void *>(qkvWeight[i].data<float>());
|
|
||||||
auto qkvBiasWeight_ptr =
|
|
||||||
reinterpret_cast<const float *>(qkvBiasWeight[i].data<float>());
|
|
||||||
auto attnOutWeight_ptr =
|
|
||||||
reinterpret_cast<const void *>(attnOutWeight[i].data<float>());
|
|
||||||
auto ln2Gamma_ptr =
|
|
||||||
reinterpret_cast<const float *>(ln2Gamma[i].data<float>());
|
|
||||||
auto ln2Beta_ptr =
|
|
||||||
reinterpret_cast<const float *>(ln2Beta[i].data<float>());
|
|
||||||
auto gate_weight_ptr =
|
|
||||||
reinterpret_cast<const void *>(gateWeight[i].data<float>());
|
|
||||||
auto up_weight_ptr =
|
|
||||||
reinterpret_cast<const void *>(upWeight[i].data<float>());
|
|
||||||
auto down_weight_ptr =
|
|
||||||
reinterpret_cast<const void *>(downWeight[i].data<float>());
|
|
||||||
auto gate_bias_ptr =
|
|
||||||
reinterpret_cast<const float *>(gateBias[i].data<float>());
|
|
||||||
auto up_bias_ptr =
|
|
||||||
reinterpret_cast<const float *>(upBias[i].data<float>());
|
|
||||||
auto down_bias_ptr =
|
|
||||||
reinterpret_cast<const float *>(downBias[i].data<float>());
|
|
||||||
auto attnOutBias_ptr =
|
|
||||||
reinterpret_cast<const float *>(attnOutBias[i].data<float>());
|
|
||||||
invokeLayerLLaMA(
|
|
||||||
xft_data_type, // dt
|
|
||||||
xft_act_type, // at
|
|
||||||
xft_norm_type, // nt
|
|
||||||
i, // layerId
|
|
||||||
totalLayer, // totalLayers
|
|
||||||
batchSize, // batchSize
|
|
||||||
inputSeqLen, // inputSeqLen
|
|
||||||
attHeadDim, // attHeadDim
|
|
||||||
attHeadNum, // attHeadNum
|
|
||||||
kvHeadNum, // kvHeadNum
|
|
||||||
maxPositions, // maxPositions
|
|
||||||
maxPosEmbed, // maxPosEmbed
|
|
||||||
past_seq_len, // pastSeqLen
|
|
||||||
cur_seq_len, // currentSeqLen
|
|
||||||
step_id, // step
|
|
||||||
hiddensize, // hiddenSize
|
|
||||||
intermediateSize, // intermediateSize
|
|
||||||
reinterpret_cast<void *>(output_ptr), // output
|
|
||||||
hiddensize, // outputStride
|
|
||||||
input_ptr, // input
|
|
||||||
hiddensize, // inputStride
|
|
||||||
ln1Gamma_ptr, // ln1Gamma
|
|
||||||
ln1Beta_ptr, // ln1Beta
|
|
||||||
qkvWeight_ptr, // queryWeight
|
|
||||||
qkvWeight_ptr + hiddensize, // keyWeight
|
|
||||||
qkvWeight_ptr + hiddensize + kvHeadNum * attHeadDim, // valueWeight
|
|
||||||
attnOutWeight_ptr, // attnOutWeight
|
|
||||||
ln2Gamma_ptr, // ln2Gamma
|
|
||||||
ln2Beta_ptr, // ln2Beta
|
|
||||||
gate_weight_ptr,
|
|
||||||
up_weight_ptr,
|
|
||||||
down_weight_ptr,
|
|
||||||
qkvBiasWeight_ptr, // queryBias
|
|
||||||
qkvBiasWeight_ptr + hiddensize, // keyBias
|
|
||||||
qkvBiasWeight_ptr + hiddensize +
|
|
||||||
kvHeadNum * attHeadDim, // valueBias
|
|
||||||
attnOutBias_ptr, // attnOutBias
|
|
||||||
qkvWeight_ptr, // myqkvWeight
|
|
||||||
gate_bias_ptr,
|
|
||||||
up_bias_ptr,
|
|
||||||
down_bias_ptr,
|
|
||||||
qkvBiasWeight_ptr);
|
|
||||||
if (i < totalLayer - 1) {
|
|
||||||
memcpy(const_cast<void *>(input_ptr),
|
|
||||||
output_ptr,
|
|
||||||
batchSize * inputSeqLen * hiddensize * sizeof(float));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return {out};
|
|
||||||
}
|
|
||||||
|
|
||||||
std::vector<std::vector<int64_t>> AllLLaMALayerInferShape(
|
|
||||||
std::vector<int64_t> x_shape) {
|
|
||||||
return {x_shape};
|
|
||||||
}
|
|
||||||
|
|
||||||
std::vector<paddle::DataType> AllLLaMALayerInferDtype(
|
|
||||||
paddle::DataType x_dtype) {
|
|
||||||
return {x_dtype};
|
|
||||||
}
|
|
||||||
|
|
||||||
PD_BUILD_STATIC_OP(xft_llama_all_layer)
|
|
||||||
.Inputs({
|
|
||||||
"x",
|
|
||||||
paddle::Vec("ln1Gamma"),
|
|
||||||
paddle::Vec("ln1Beta"),
|
|
||||||
paddle::Vec("qkvWeight"),
|
|
||||||
paddle::Vec("qkvBiasWeight"),
|
|
||||||
paddle::Vec("attnOutWeight"),
|
|
||||||
paddle::Vec("attnOutBias"),
|
|
||||||
paddle::Vec("ln2Gamma"),
|
|
||||||
paddle::Vec("ln2Beta"),
|
|
||||||
paddle::Vec("gateWeight"),
|
|
||||||
paddle::Vec("gateBias"),
|
|
||||||
paddle::Vec("upWeight"),
|
|
||||||
paddle::Vec("upBias"),
|
|
||||||
paddle::Vec("downWeight"),
|
|
||||||
paddle::Vec("downBias"),
|
|
||||||
"pastSeqLen",
|
|
||||||
"currentSeqLen",
|
|
||||||
"step",
|
|
||||||
})
|
|
||||||
.Outputs({"out"})
|
|
||||||
.Attrs({"hiddensize :int",
|
|
||||||
"totalLayer :int",
|
|
||||||
"computeType : std::string",
|
|
||||||
"activation :std::string",
|
|
||||||
"normType :std::string",
|
|
||||||
"attHeadDim: int",
|
|
||||||
"attHeadNum: int",
|
|
||||||
"kvHeadNum: int",
|
|
||||||
"maxPositions: int",
|
|
||||||
"maxPosEmbed: int",
|
|
||||||
"intermediateSize: int"})
|
|
||||||
.SetKernelFn(PD_KERNEL(InvokeAllLLaMALayer))
|
|
||||||
.SetInferShapeFn(PD_INFER_SHAPE(AllLLaMALayerInferShape))
|
|
||||||
.SetInferDtypeFn(PD_INFER_DTYPE(AllLLaMALayerInferDtype));
|
|
@@ -1,126 +0,0 @@
|
|||||||
// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
|
|
||||||
//
|
|
||||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
||||||
// you may not use this file except in compliance with the License.
|
|
||||||
// You may obtain a copy of the License at
|
|
||||||
//
|
|
||||||
// http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
//
|
|
||||||
// Unless required by applicable law or agreed to in writing, software
|
|
||||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
// See the License for the specific language governing permissions and
|
|
||||||
// limitations under the License.
|
|
||||||
#include <omp.h>
|
|
||||||
#include <cstdio>
|
|
||||||
#include <iostream>
|
|
||||||
#include "paddle/extension.h"
|
|
||||||
|
|
||||||
void greedy_search(const float *probs,
|
|
||||||
int64_t *next_token_ids,
|
|
||||||
int bsz,
|
|
||||||
int vocab_size) {
|
|
||||||
int numThreads = 0;
|
|
||||||
#pragma omp parallel
|
|
||||||
{
|
|
||||||
int tid = omp_get_thread_num();
|
|
||||||
if (tid == 0) {
|
|
||||||
numThreads = omp_get_num_threads();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
float maxVals[bsz];
|
|
||||||
|
|
||||||
// Small batch size (each sample can have at least 2 threads)
|
|
||||||
if (numThreads / bsz >= 2) {
|
|
||||||
int thrPerSample = numThreads / bsz;
|
|
||||||
int sizePerThr = (vocab_size + thrPerSample - 1) / thrPerSample;
|
|
||||||
int maxIndices[bsz * thrPerSample];
|
|
||||||
float maxValues[bsz * thrPerSample];
|
|
||||||
|
|
||||||
// TODO: if size is small, possible to cause out of boundary
|
|
||||||
#pragma omp parallel for collapse(2)
|
|
||||||
for (int b = 0; b < bsz; ++b) {
|
|
||||||
for (int t = 0; t < thrPerSample; ++t) {
|
|
||||||
int start = t * sizePerThr;
|
|
||||||
int end = (start + sizePerThr) > vocab_size
|
|
||||||
? vocab_size
|
|
||||||
: (start + sizePerThr);
|
|
||||||
const float *p = probs + b * vocab_size;
|
|
||||||
int maxIdx = start;
|
|
||||||
float maxVal = p[start];
|
|
||||||
for (int off = start + 1; off < end; ++off) {
|
|
||||||
if (p[off] > maxVal) {
|
|
||||||
maxVal = p[off];
|
|
||||||
maxIdx = off;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// False sharing happens, but since only one time, not avoided
|
|
||||||
maxIndices[b * thrPerSample + t] = maxIdx;
|
|
||||||
maxValues[b * thrPerSample + t] = maxVal;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Local reduction
|
|
||||||
for (int i = 0; i < bsz; ++i) {
|
|
||||||
int *pIndices = maxIndices + i * thrPerSample;
|
|
||||||
float *pValues = maxValues + i * thrPerSample;
|
|
||||||
int maxIdx = pIndices[0];
|
|
||||||
float maxVal = pValues[0];
|
|
||||||
for (int j = 1; j < thrPerSample; ++j) {
|
|
||||||
if (pValues[j] > maxVal) {
|
|
||||||
maxVal = pValues[j];
|
|
||||||
maxIdx = pIndices[j];
|
|
||||||
}
|
|
||||||
}
|
|
||||||
next_token_ids[i] = maxIdx;
|
|
||||||
maxVals[i] = maxVal;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Each thread handle one sample (one row)
|
|
||||||
else {
|
|
||||||
#pragma omp parallel for
|
|
||||||
for (int i = 0; i < bsz; ++i) {
|
|
||||||
int maxId = 0;
|
|
||||||
const float *p = probs + i * vocab_size;
|
|
||||||
float maxVal = p[0];
|
|
||||||
for (int j = 1; j < vocab_size; ++j) {
|
|
||||||
if (p[j] > maxVal) {
|
|
||||||
maxVal = p[j];
|
|
||||||
maxId = j;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
next_token_ids[i] = maxId;
|
|
||||||
maxVals[i] = maxVal;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
std::vector<paddle::Tensor> XftGreedySearch(const paddle::Tensor &probs) {
|
|
||||||
const int bsz = probs.shape()[0];
|
|
||||||
const int vocab_size = probs.shape()[1];
|
|
||||||
auto next_tokens =
|
|
||||||
paddle::empty({bsz, 1}, paddle::DataType::INT64, probs.place());
|
|
||||||
|
|
||||||
greedy_search(probs.data<float>(),
|
|
||||||
const_cast<int64_t *>(next_tokens.data<int64_t>()),
|
|
||||||
bsz,
|
|
||||||
vocab_size);
|
|
||||||
return {next_tokens};
|
|
||||||
}
|
|
||||||
std::vector<std::vector<int64_t>> XftGreedySearchInferShape(
|
|
||||||
const std::vector<int64_t> &probs_shape) {
|
|
||||||
int64_t bsz = probs_shape[0];
|
|
||||||
return {{bsz, 1}};
|
|
||||||
}
|
|
||||||
std::vector<paddle::DataType> XftGreedySearchInferDtype(
|
|
||||||
const paddle::DataType &probs_dtype) {
|
|
||||||
return {paddle::DataType::INT64};
|
|
||||||
}
|
|
||||||
PD_BUILD_STATIC_OP(xft_greedy_search)
|
|
||||||
.Inputs({"probs"})
|
|
||||||
.Outputs({"next_tokens_ids"})
|
|
||||||
.SetInferShapeFn(PD_INFER_SHAPE(XftGreedySearchInferShape))
|
|
||||||
.SetInferDtypeFn(PD_INFER_DTYPE(XftGreedySearchInferDtype))
|
|
||||||
.SetKernelFn(PD_KERNEL(XftGreedySearch));
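Functionally, the OpenMP kernel above is a per-row argmax over the vocabulary; a NumPy reference for checking its output (not the registered op) could look like:

import numpy as np

def xft_greedy_search_ref(probs):
    # probs: [bsz, vocab_size] float32 -> next token ids as int64 of shape [bsz, 1]
    return np.argmax(probs, axis=1).astype(np.int64).reshape(-1, 1)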
|
|
File diff suppressed because it is too large
@@ -17,15 +17,12 @@
|
|||||||
#include "paddle/phi/core/memory/memcpy.h"
|
#include "paddle/phi/core/memory/memcpy.h"
|
||||||
|
|
||||||
template <int THREADBLOCK_SIZE>
|
template <int THREADBLOCK_SIZE>
|
||||||
__global__ void GetMaxLenKernel(const int *seq_lens,
|
__global__ void
|
||||||
const int *seq_lens_this_time,
|
GetMaxLenKernel(const int *seq_lens, const int *seq_lens_this_time,
|
||||||
const int *seq_lens_encoder,
|
const int *seq_lens_encoder,
|
||||||
const int *seq_lens_this_time_merged,
|
const int *seq_lens_this_time_merged,
|
        const int *seq_lens_encoder_merged, const int *seq_mapping,
        const int *system_lens, int *max_lens, const int batch_size) {
  const int tid = threadIdx.x;

  typedef cub::BlockReduce<int, THREADBLOCK_SIZE> BlockReduce;
@@ -41,43 +38,61 @@ __global__ void GetMaxLenKernel(const int *seq_lens,
  int max_dec_len_without_system_this_thread = 0;
  for (int i = tid; i < batch_size; i += blockDim.x) {
    const int seq_len_this_time = seq_lens_this_time[i];
    max_len_this_time_this_thread =
        max(seq_len_this_time, max_len_this_time_this_thread);
    max_len_encoder_this_thread =
        max(seq_lens_encoder[i], max_len_encoder_this_thread);
    max_len_decoder_this_thread = max(seq_lens[i], max_len_decoder_this_thread);
    if (seq_len_this_time <= 0)
      continue;
    const int max_just_dec_len_now = seq_lens_encoder[i] > 0 ? 0 : seq_lens[i];
    max_len_this_thread =
        max(seq_lens[i] + seq_len_this_time, max_len_this_thread);
    max_just_dec_len_this_thread =
        max(max_just_dec_len_this_thread, max_just_dec_len_now);
    if (system_lens) {
      const int real_bid = seq_mapping[i];
      const int system_len_now = system_lens[real_bid];
      max_system_len_this_thread =
          max(max_system_len_this_thread, system_len_now);
      max_dec_len_without_system_this_thread =
          max(max_dec_len_without_system_this_thread,
              max_just_dec_len_now - system_len_now);
    }
  }
  if (system_lens) {
    for (int i = tid; i < batch_size; i += blockDim.x) {
      const int ori_seq_len_this_time = seq_lens_this_time_merged[i];
      if (ori_seq_len_this_time <= 0)
        continue;
      const int max_just_dec_merged_len_this_time_now =
          seq_lens_encoder_merged[i] > 0 ? 0 : ori_seq_len_this_time;
      max_just_dec_merged_len_this_time_this_thread =
          max(max_just_dec_merged_len_this_time_this_thread,
              max_just_dec_merged_len_this_time_now);
    }
  }
  int total_max_len_this_time =
      BlockReduce(temp_storage)
          .Reduce(max_len_this_time_this_thread, MaxOp<int>());
  int total_max_len_encoder =
      BlockReduce(temp_storage)
          .Reduce(max_len_encoder_this_thread, MaxOp<int>());
  int total_max_len_decoder =
      BlockReduce(temp_storage)
          .Reduce(max_len_decoder_this_thread, MaxOp<int>());
  int total =
      BlockReduce(temp_storage).Reduce(max_len_this_thread, MaxOp<int>());
  int total_just_dec = BlockReduce(temp_storage)
                           .Reduce(max_just_dec_len_this_thread, MaxOp<int>());
  int total_just_dec_merged =
      BlockReduce(temp_storage)
          .Reduce(max_just_dec_merged_len_this_time_this_thread, MaxOp<int>());
  int total_system_len = BlockReduce(temp_storage)
                             .Reduce(max_system_len_this_thread, MaxOp<int>());
  int total_dec_len_without_system =
      BlockReduce(temp_storage)
          .Reduce(max_dec_len_without_system_this_thread, MaxOp<int>());
  if (tid == 0) {
    max_lens[0] = total_max_len_this_time;
    max_lens[1] = total_max_len_encoder;
@@ -90,30 +105,22 @@ __global__ void GetMaxLenKernel(const int *seq_lens,
  }
}

void GetMaxLen(const paddle::Tensor &seq_lens_tensor,
               const paddle::Tensor &seq_lens_this_time,
               const paddle::Tensor &seq_lens_encoder,
               paddle::Tensor &max_len_tensor, const int batch_size) {
  constexpr int blockSize = 1024;
  GetMaxLenKernel<blockSize><<<1, blockSize, 0, seq_lens_encoder.stream()>>>(
      seq_lens_tensor.data<int>(), seq_lens_this_time.data<int>(),
      seq_lens_encoder.data<int>(), nullptr, nullptr, nullptr, nullptr,
      max_len_tensor.data<int>(), batch_size);
}

__global__ void split_q_block(const int *__restrict__ seq_lens_q,
                              const int *__restrict__ seq_lens_encoder,
                              int *__restrict__ batch_ids,
                              int *__restrict__ tile_ids_per_batch,
                              int *__restrict__ num_blocks_x, const int bsz,
                              const int num_rows_per_block,
                              const int group_size) {
  if (threadIdx.x == 0) {
@@ -124,8 +131,7 @@ __global__ void split_q_block(const int* __restrict__ seq_lens_q,
      if (seq_lens_encoder && seq_lens_encoder[bid] > 0) {
        seq_len = 0;
      }
      const int loop_times = div_up(seq_len * group_size, num_rows_per_block);
      for (uint32_t tile_id = 0; tile_id < loop_times; tile_id++) {
        batch_ids[index] = bid;
        tile_ids_per_batch[index++] = tile_id;
@@ -136,14 +142,12 @@ __global__ void split_q_block(const int* __restrict__ seq_lens_q,
  }
}

__global__ void split_kv_block(const int *__restrict__ seq_lens_decoder,
                               const int *__restrict__ seq_lens_encoder,
                               int *__restrict__ batch_ids,
                               int *__restrict__ tile_ids_per_batch,
                               int *__restrict__ num_blocks_x, const int bsz,
                               const int pad_len, const int num_row_per_block) {
  if (threadIdx.x == 0) {
    int gridx = 0;
    int index = 0;
@@ -165,50 +169,46 @@ __global__ void split_kv_block(const int* __restrict__ seq_lens_decoder,
}

template <int THREADBLOCK_SIZE>
__global__ void
get_max_len_kv_ernel(int *max_seq_lens_out, const int *seq_lens_this_time,
                     const int *seq_lens_decoder, const int batch_size) {
  const int tid = threadIdx.x;

  typedef cub::BlockReduce<int, THREADBLOCK_SIZE> BlockReduce;
  __shared__ typename BlockReduce::TempStorage temp_storage;

  int max_len_this_thread = 0;
  for (int i = tid; i < batch_size; i += blockDim.x) {
    if (seq_lens_decoder[i] == 0)
      continue;
    max_len_this_thread =
        max(seq_lens_this_time[i] + seq_lens_decoder[i], max_len_this_thread);
  }
  int total =
      BlockReduce(temp_storage).Reduce(max_len_this_thread, MaxOp<int>());
  if (tid == 0) {
    *max_seq_lens_out = total;
  }
}

std::vector<paddle::Tensor> GetBlockShapeAndSplitKVBlock(
    const paddle::Tensor &seq_lens_encoder,
    const paddle::Tensor &seq_lens_decoder,
    const paddle::Tensor &seq_lens_this_time, const paddle::Tensor &cum_offsets,
    const int encoder_block_shape_q, const int decoder_block_shape_q,
    const int group_size, const int block_size,
    const int decoder_step_token_num) {
  auto stream = seq_lens_encoder.stream();
  int bsz = cum_offsets.shape()[0];
  auto max_len_tensor =
      GetEmptyTensor({8}, paddle::DataType::INT32, seq_lens_encoder.place());
  GetMaxLen(seq_lens_decoder, seq_lens_this_time, seq_lens_encoder,
            max_len_tensor, bsz);

  // max_len_this_time, max_enc_len_this_time, max_dec_len_this_time,
  // max_enc_dec_len_this_time, max_just_dec_len_this_time,
  // max_just_dec_merged_len_this_time, max_system_len,
  // max_just_dec_len_without_system
  auto max_len_cpu = max_len_tensor.copy_to(paddle::CPUPlace(), false);
  auto max_len_cpu_ptr = max_len_cpu.data<int>();
  int max_len_this_time = max_len_cpu_ptr[0];
@@ -229,67 +229,67 @@ std::vector<paddle::Tensor> GetBlockShapeAndSplitKVBlock(
  paddle::Tensor decoder_batch_ids;
  paddle::Tensor decoder_tile_ids_per_batch;
  paddle::Tensor decoder_num_blocks_x_cpu; /*cpu*/
  paddle::Tensor max_len_kv_cpu;           /*cpu*/

  auto max_len_kv =
      GetEmptyTensor({1}, paddle::DataType::INT32, seq_lens_decoder.place());
  get_max_len_kv_ernel<128><<<1, 128, 0, stream>>>(
      max_len_kv.data<int>(), seq_lens_this_time.data<int>(),
      seq_lens_decoder.data<int>(), bsz);

  max_len_kv_cpu = max_len_kv.copy_to(paddle::CPUPlace(), false);

  if (max_enc_len_this_time > 0) {
    const uint32_t max_tile_size_per_bs_kv =
        div_up(max_enc_dec_len_this_time, block_size);
    kv_batch_ids =
        GetEmptyTensor({bsz * max_tile_size_per_bs_kv}, paddle::DataType::INT32,
                       seq_lens_encoder.place());
    kv_tile_ids_per_batch =
        GetEmptyTensor({bsz * max_tile_size_per_bs_kv}, paddle::DataType::INT32,
                       seq_lens_encoder.place());
    auto kv_num_blocks_x =
        GetEmptyTensor({1}, paddle::DataType::INT32, seq_lens_encoder.place());

    split_kv_block<<<1, 32, 0, seq_lens_encoder.stream()>>>(
        seq_lens_decoder.data<int>(),
        // sequence_lengths->data<int>(),
        seq_lens_encoder.data<int>(), kv_batch_ids.data<int>(),
        kv_tile_ids_per_batch.data<int>(), kv_num_blocks_x.data<int>(), bsz,
        block_size, block_size);

    kv_num_blocks_x_cpu = kv_num_blocks_x.copy_to(paddle::CPUPlace(), false);

    const uint32_t encoder_max_tile_size_per_bs_q =
        div_up((max_enc_dec_len_this_time * group_size), encoder_block_shape_q);
    encoder_batch_ids =
        GetEmptyTensor({bsz * encoder_max_tile_size_per_bs_q},
                       paddle::DataType::INT32, seq_lens_encoder.place());
    encoder_tile_ids_per_batch =
        GetEmptyTensor({bsz * encoder_max_tile_size_per_bs_q},
                       paddle::DataType::INT32, seq_lens_encoder.place());
    auto encoder_num_blocks_x =
        GetEmptyTensor({1}, paddle::DataType::INT32, seq_lens_encoder.place());
    split_q_block<<<1, 32, 0, stream>>>(seq_lens_encoder.data<int>(), nullptr,
                                        encoder_batch_ids.data<int>(),
                                        encoder_tile_ids_per_batch.data<int>(),
                                        encoder_num_blocks_x.data<int>(), bsz,
                                        encoder_block_shape_q, group_size);
    encoder_num_blocks_x_cpu =
        encoder_num_blocks_x.copy_to(paddle::CPUPlace(), false);
  } else {
    encoder_batch_ids =
        GetEmptyTensor({0}, paddle::DataType::INT32, seq_lens_encoder.place());
    encoder_tile_ids_per_batch =
        GetEmptyTensor({0}, paddle::DataType::INT32, seq_lens_encoder.place());
    encoder_num_blocks_x_cpu =
        GetEmptyTensor({0}, paddle::DataType::INT32, paddle::CPUPlace());
    kv_batch_ids =
        GetEmptyTensor({0}, paddle::DataType::INT32, seq_lens_encoder.place());
    kv_tile_ids_per_batch =
        GetEmptyTensor({0}, paddle::DataType::INT32, seq_lens_encoder.place());
    kv_num_blocks_x_cpu =
        GetEmptyTensor({0}, paddle::DataType::INT32, seq_lens_encoder.place());
  }
  if (max_just_dec_len_this_time > 0) {
    const uint32_t decoder_max_tile_size_per_bs_q =
@@ -297,24 +297,26 @@ std::vector<paddle::Tensor> GetBlockShapeAndSplitKVBlock(
    decoder_batch_ids =
        GetEmptyTensor({bsz * decoder_max_tile_size_per_bs_q},
                       paddle::DataType::INT32, seq_lens_encoder.place());
    decoder_tile_ids_per_batch =
        GetEmptyTensor({bsz * decoder_max_tile_size_per_bs_q},
                       paddle::DataType::INT32, seq_lens_encoder.place());
    auto decoder_num_blocks_x =
        GetEmptyTensor({1}, paddle::DataType::INT32, seq_lens_encoder.place());
    split_q_block<<<1, 32, 0, stream>>>(
        seq_lens_this_time.data<int>(), seq_lens_encoder.data<int>(),
        decoder_batch_ids.data<int>(), decoder_tile_ids_per_batch.data<int>(),
        decoder_num_blocks_x.data<int>(), bsz, decoder_block_shape_q,
        group_size);
    decoder_num_blocks_x_cpu =
        decoder_num_blocks_x.copy_to(paddle::CPUPlace(), false);
  } else {
    decoder_batch_ids =
        GetEmptyTensor({0}, paddle::DataType::INT32, seq_lens_encoder.place());
    decoder_tile_ids_per_batch =
        GetEmptyTensor({0}, paddle::DataType::INT32, seq_lens_encoder.place());
    decoder_num_blocks_x_cpu =
        GetEmptyTensor({0}, paddle::DataType::INT32, paddle::CPUPlace());
  }

  return {encoder_batch_ids,
@@ -331,28 +333,22 @@ std::vector<paddle::Tensor> GetBlockShapeAndSplitKVBlock(
}

std::vector<paddle::DataType> GetBlockShapeAndSplitKVBlockInferDtype(
    const paddle::DataType &seq_lens_encoder_dtype,
    const paddle::DataType &seq_lens_decoder_dtype,
    const paddle::DataType &seq_lens_this_time_dtype,
    const paddle::DataType &cum_offsets_dtype) {
  return {
      paddle::DataType::INT32, paddle::DataType::INT32, paddle::DataType::INT32,
      paddle::DataType::INT32, paddle::DataType::INT32, paddle::DataType::INT32,
      paddle::DataType::INT32, paddle::DataType::INT32, paddle::DataType::INT32,
      paddle::DataType::INT32, paddle::DataType::INT32};
}

std::vector<std::vector<int64_t>> GetBlockShapeAndSplitKVBlockInferShape(
    const std::vector<int64_t> &seq_lens_encoder_shape,
    const std::vector<int64_t> &seq_lens_decoder_shape,
    const std::vector<int64_t> &seq_lens_this_time_shape,
    const std::vector<int64_t> &cum_offsets_shape) {
  std::vector<int64_t> dynamic_shape = {-1};

  return {dynamic_shape,
@@ -369,9 +365,7 @@ std::vector<std::vector<int64_t>> GetBlockShapeAndSplitKVBlockInferShape(
}

PD_BUILD_STATIC_OP(get_block_shape_and_split_kv_block)
    .Inputs({"seq_lens_encoder", "seq_lens_decoder", "seq_lens_this_time",
             "cum_offsets"})
    .Outputs({paddle::Optional("encoder_batch_ids"),
              paddle::Optional("encoder_tile_ids_per_batch"),
@@ -382,12 +376,9 @@ PD_BUILD_STATIC_OP(get_block_shape_and_split_kv_block)
              paddle::Optional("decoder_batch_ids"),
              paddle::Optional("decoder_tile_ids_per_batch"),
              paddle::Optional("decoder_num_blocks"),
              paddle::Optional("max_len_kv"), "set_max_lengths"})
    .Attrs({"encoder_block_shape_q: int", "decoder_block_shape_q: int",
            "group_size: int", "block_size: int",
            "decoder_step_token_num: int"})
    .SetKernelFn(PD_KERNEL(GetBlockShapeAndSplitKVBlock))
    .SetInferShapeFn(PD_INFER_SHAPE(GetBlockShapeAndSplitKVBlockInferShape))
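The tile bookkeeping above reduces to a small piece of integer arithmetic. The standalone host-side sketch below is not part of this commit and uses made-up example values; it only reproduces the div_up-based tile counts that split_q_block and split_kv_block write out per batch entry.

// Standalone sketch of the tile-count arithmetic used above; the concrete
// numbers are hypothetical examples, not FastDeploy defaults.
#include <cstdio>

static int div_up(int a, int b) { return (a + b - 1) / b; }

int main() {
  const int group_size = 8;              // hypothetical GQA group size
  const int encoder_block_shape_q = 64;  // hypothetical query rows per tile
  const int block_size = 64;             // hypothetical KV-cache block size
  const int seq_len = 100;               // hypothetical prompt length

  // Q-side tiling: split_q_block emits one (batch_id, tile_id) pair per tile.
  const int q_tiles = div_up(seq_len * group_size, encoder_block_shape_q);  // 13
  // KV-side tiling: split_kv_block emits one tile per KV block of the sequence.
  const int kv_tiles = div_up(seq_len, block_size);  // 2
  std::printf("q_tiles=%d kv_tiles=%d\n", q_tiles, kv_tiles);
  return 0;
}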
@@ -337,6 +337,8 @@ __forceinline__ __host__ __device__ void vec_cast<nv_bfloat16, float>(
  } else if (deal_each_time == 64) {                                         \
    constexpr size_t DEAL_EACH_TIME = 64;                                    \
    __VA_ARGS__                                                              \
  } else {                                                                   \
    PD_THROW("not support the deal_each_time", deal_each_time);              \
  }

#define DISPATCH_NUM_THREADS(num_threads, NUM_THREADS, ...)                  \
@@ -346,6 +348,8 @@ __forceinline__ __host__ __device__ void vec_cast<nv_bfloat16, float>(
  } else if (num_threads == 256) {                                           \
    constexpr size_t NUM_THREADS = 256;                                      \
    __VA_ARGS__                                                              \
  } else {                                                                   \
    PD_THROW("not support the num_threads", num_threads);                    \
  }

#define DISPATCH_GQA_GROUP_SIZE(group_size, GROUP_SIZE, ...)                 \
@@ -376,6 +380,11 @@ __forceinline__ __host__ __device__ void vec_cast<nv_bfloat16, float>(
  } else if (group_size == 12) {                                             \
    constexpr size_t GROUP_SIZE = 12;                                        \
    __VA_ARGS__                                                              \
  } else if (group_size == 16) {                                             \
    constexpr size_t GROUP_SIZE = 16;                                        \
    __VA_ARGS__                                                              \
  } else {                                                                   \
    PD_THROW("not support the group_size", group_size);                      \
  }

#define DISPATCH_BLOCKSHAPE_Q(block_shape_q, BLOCK_SHAPE_Q, NUM_WARP_Q, ...) \
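These dispatch macros all follow the same pattern: branch once on a runtime value, bind it to a constexpr constant, and run the caller-supplied body, with the new else branches making unsupported values fail through PD_THROW instead of compiling to nothing. The self-contained sketch below is a simplified stand-in (the _SKETCH macro is illustrative, not the real FastDeploy header) showing how such a macro turns a runtime value into a template parameter.

// Simplified, assumed illustration of the dispatch pattern above.
#include <cstddef>
#include <cstdio>
#include <stdexcept>

#define DISPATCH_GQA_GROUP_SIZE_SKETCH(group_size, GROUP_SIZE, ...) \
  if (group_size == 8) {                                            \
    constexpr size_t GROUP_SIZE = 8;                                \
    __VA_ARGS__                                                     \
  } else if (group_size == 16) {                                    \
    constexpr size_t GROUP_SIZE = 16;                               \
    __VA_ARGS__                                                     \
  } else {                                                          \
    throw std::runtime_error("not support the group_size");         \
  }

template <size_t GROUP_SIZE>
void RunKernel() {
  std::printf("instantiated with GROUP_SIZE=%zu\n", GROUP_SIZE);
}

int main() {
  const int group_size = 16;  // runtime value, e.g. num_q_heads / num_kv_heads
  DISPATCH_GQA_GROUP_SIZE_SKETCH(group_size, GROUP_SIZE, { RunKernel<GROUP_SIZE>(); })
  return 0;
}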
@@ -13,7 +13,7 @@
// limitations under the License.

#include "paddle/extension.h"
#include "pybind11/pybind11.h"
namespace py = pybind11;

// Custom exception class for handling CUDA errors
@@ -125,45 +125,40 @@ paddle::Tensor FusedExpertMoeFunc(
    const bool norm_topk_prob, const bool group_moe);

std::vector<paddle::Tensor> MoeExpertDispatch(
    const paddle::Tensor &input, const paddle::Tensor &gating_output,
    const paddle::optional<paddle::Tensor> &gating_correction_bias,
    const paddle::optional<paddle::Tensor> &w4a8_in_scale, const int moe_topk,
    const bool group_moe, const bool topk_only_mode);

std::vector<paddle::Tensor>
MoETopKSelectKernel(const paddle::Tensor &gating_logits,
                    const paddle::optional<paddle::Tensor> &bias,
                    const int moe_topk, const bool apply_norm_weight,
                    const bool enable_softmax_top_k_fused);

std::vector<paddle::Tensor>
MoERedundantTopKSelectKernel(const paddle::Tensor &gating_logits,
                             const paddle::Tensor &expert_id_to_ep_rank_array,
                             const paddle::Tensor &expert_in_rank_num_list,
                             paddle::Tensor &tokens_per_expert_stats_list,
                             const paddle::optional<paddle::Tensor> &bias,
                             const int moe_topk, const bool apply_norm_weight,
                             const bool enable_softmax_top_k_fused,
                             const int redundant_ep_rank_num_plus_one);

std::vector<paddle::Tensor>
EPMoeExpertDispatch(const paddle::Tensor &input, const paddle::Tensor &topk_ids,
                    const paddle::Tensor &topk_weights,
                    const paddle::optional<paddle::Tensor> &ffn1_in_scale,
                    const std::vector<int> &token_nums_per_expert,
                    const int token_nums_this_rank,
                    const std::string &moe_quant_type);

std::vector<paddle::Tensor> EPMoeExpertDispatchFP8(
    const paddle::Tensor &input, const paddle::Tensor &scale,
    const paddle::Tensor &topk_ids, const paddle::Tensor &topk_weights,
    const paddle::Tensor &token_nums_per_expert,
    const paddle::Tensor &token_nums_per_expert_padded);

std::vector<paddle::Tensor> PerTokenQuant(paddle::Tensor &input,
                                          const int block_size);
@@ -180,20 +175,35 @@ std::vector<paddle::Tensor> EPMoeExpertCombine(
    const paddle::optional<paddle::Tensor> &ffn2_bias,
    const bool norm_topk_prob, const float routed_scaling_factor);

std::vector<std::vector<int>> GetExpertTokenNum(const paddle::Tensor &topk_ids,
                                                const int num_experts);

paddle::Tensor MoeExpertFFNFunc(
    const paddle::Tensor& permute_input,
    const paddle::Tensor& tokens_expert_prefix_sum,
    const paddle::Tensor& ffn1_weight, const paddle::Tensor& ffn2_weight,
    const paddle::optional<paddle::Tensor>& ffn1_bias,
    const paddle::optional<paddle::Tensor>& ffn1_scale,
    const paddle::optional<paddle::Tensor>& ffn2_scale,
    const paddle::optional<paddle::Tensor>& ffn2_in_scale,
    const paddle::optional<paddle::Tensor>& expert_idx_per_token,
    const std::string& quant_method, const bool used_in_ep_low_latency);

paddle::Tensor MoeExpertFFNWint2Func(
    const paddle::Tensor& permute_input,
    const paddle::Tensor& tokens_expert_prefix_sum,
    const paddle::Tensor& ffn1_weight,
    const paddle::Tensor& ffn2_weight,
    const paddle::optional<paddle::Tensor>& ffn1_bias,
    const paddle::optional<paddle::Tensor>& ffn1_scale,
    const paddle::optional<paddle::Tensor>& ffn2_scale,
    const paddle::optional<paddle::Tensor>& ffn1_local_scale,
    const paddle::optional<paddle::Tensor>& ffn1_code_scale,
    const paddle::optional<paddle::Tensor>& ffn1_code_zp,
    const paddle::optional<paddle::Tensor>& ffn2_local_scale,
    const paddle::optional<paddle::Tensor>& ffn2_code_scale,
    const paddle::optional<paddle::Tensor>& ffn2_code_zp,
    const bool used_in_ep_low_latency);

paddle::Tensor MoeExpertReduceFunc(
    const paddle::Tensor &ffn_out, const paddle::Tensor &top_k_weight,
@@ -205,19 +215,16 @@ paddle::Tensor MoeExpertReduceFunc(
void InitKVSignalPerQuery(const paddle::Tensor &seq_lens_encoder_tensor,
                          const paddle::Tensor &seq_lens_this_time_tensor,
                          const paddle::Tensor &seq_lens_decoder_tensor,
                          const int rank, const int num_layers);

void GetOutputKVSignal(const paddle::Tensor &x, int64_t rank_id,
                       bool wait_flag);

paddle::Tensor DequantInt8Func(const paddle::Tensor &input,
                               const paddle::Tensor &out_scale,
                               std::string dtype);

paddle::Tensor OpenShmAndGetMetaSignalFunc(const int rank, const int device_id,
                                           const bool keep_pd_step_flag);

paddle::Tensor InitSignalLayerwiseFunc(const paddle::Tensor &kv_signal_metadata,
@@ -286,61 +293,121 @@ std::vector<paddle::Tensor> ExtractTextTokenOutput(
    const paddle::Tensor &seq_lens_this_time,
    const paddle::Tensor &cu_seqlens_q, const paddle::Tensor &score_text);

std::vector<paddle::Tensor> MoEDeepGEMMPermute(const paddle::Tensor &x,
                                               const paddle::Tensor &topk_idx,
                                               const int num_experts,
                                               const int max_tokens_per_expert);

std::vector<paddle::Tensor> MoEDeepGEMMDePermute(
    const paddle::Tensor
        &ffn_out, // [num_experts, max_tokens_per_expert, hidden]
    const paddle::Tensor &permute_indices_per_token, // [token_num, topk]
    const paddle::Tensor &topk_idx, const paddle::Tensor &topk_weights);

void TextImageIndexOut(const paddle::Tensor &token_type_ids,
                       const paddle::Tensor &text_input,
                       const paddle::Tensor &image_input);

void TextImageGatherScatter(paddle::Tensor &input, paddle::Tensor &text_input,
                            paddle::Tensor &image_input,
                            paddle::Tensor &token_type_ids,
                            paddle::Tensor &text_index,
                            paddle::Tensor &image_index, const bool is_scatter);

paddle::Tensor count_tokens_per_expert_func(const paddle::Tensor &topk_ids,
                                            int64_t num_experts);

std::vector<paddle::Tensor> tritonmoe_preprocess_kernel(
    const paddle::Tensor& topk_ids, int64_t num_experts,
    int64_t GEMM_BLOCK_SIZE_M);

std::vector<paddle::Tensor> MoeWna16MarlinGemmApi(
    const paddle::Tensor& a,
    const paddle::optional<paddle::Tensor>& c_or_none,
    const paddle::Tensor& b_q_weight,
    const paddle::Tensor& b_scales,
    const paddle::optional<paddle::Tensor>& global_scale_or_none,
    const paddle::optional<paddle::Tensor>& b_zeros_or_none,
    const paddle::optional<paddle::Tensor>& g_idx_or_none,
    const paddle::optional<paddle::Tensor>& perm_or_none,
    const paddle::Tensor& workspace,
    const paddle::Tensor& sorted_token_ids,
    const paddle::Tensor& expert_ids,
    const paddle::Tensor& num_tokens_post_padded,
    const paddle::Tensor& topk_weights,
    int64_t moe_block_size,
    int64_t top_k,
    bool mul_topk_weights,
    bool is_ep,
    const std::string& b_q_type_str,
    int64_t size_m,
    int64_t size_n,
    int64_t size_k,
    bool is_k_full,
    bool use_atomic_add,
    bool use_fp32_reduce,
    bool is_zp_float);

void CutlassScaledMm(paddle::Tensor &c, paddle::Tensor const &a,
                     paddle::Tensor const &b, paddle::Tensor const &a_scales,
                     paddle::Tensor const &b_scales,
                     paddle::optional<paddle::Tensor> const &bias);

void CutlassScaledMmAzp(paddle::Tensor& c, paddle::Tensor const& a,
                        paddle::Tensor const& b,
                        paddle::Tensor const& a_scales,
                        paddle::Tensor const& b_scales,
                        paddle::Tensor const& azp_adj,
                        paddle::optional<paddle::Tensor> const& azp,
                        paddle::optional<paddle::Tensor> const& bias);

void StaticScaledFp8Quant(paddle::Tensor &out, paddle::Tensor const &input,
                          paddle::Tensor const &scale);

void DynamicScaledFp8Quant(paddle::Tensor &out, paddle::Tensor const &input,
                           paddle::Tensor &scale);

void DynamicPerTokenScaledFp8Quant(paddle::Tensor &out,
                                   paddle::Tensor const &input,
                                   paddle::Tensor &scales, float scale_ub);

PYBIND11_MODULE(fastdeploy_ops, m) {

  m.def("get_expert_token_num", &GetExpertTokenNum, py::arg("topk_ids"),
        py::arg("num_experts"), "get expert token num");

  /**
   * moe/fused_moe/moe_redundant_topk_select.cu
   * moe_redundant_topk_select
   */
  m.def("f_moe_redundant_topk_select", &MoERedundantTopKSelectKernel,
        py::arg("gating_logits"), py::arg("expert_id_to_ep_rank_array"),
        py::arg("expert_in_rank_num_list"),
        py::arg("tokens_per_expert_stats_list"), py::arg("bias"),
        py::arg("moe_topk"), py::arg("apply_norm_weight"),
        py::arg("enable_softmax_top_k_fused"),
        py::arg("redundant_ep_rank_num_plus_one"),
        "moe export RedundantTopKSelect function");

  /**
   * open_shm_and_get_meta_signal.cc
   * InitKVSignalPerQuery
   */
  m.def("init_kv_signal_per_query", &InitKVSignalPerQuery,
        py::arg("seq_lens_encoder_tensor"),
        py::arg("seq_lens_this_time_tensor"),
        py::arg("seq_lens_decoder_tensor"), py::arg("rank"),
        py::arg("num_layers"), "init_kv_signal_per_query function");

  /**
   * GetOutputKVSignal
   */
  m.def("get_output_kv_signal", &GetOutputKVSignal, py::arg("x"),
        py::arg("rank_id"), py::arg("wait_flag"),
        "get_output_kv_signal function");

  m.def("moe_deepgemm_permute", &MoEDeepGEMMPermute, "MoEDeepGEMMPermute");
  m.def("moe_deepgemm_depermute", &MoEDeepGEMMDePermute,
        "MoEDeepGEMMDePermute");

  /**
   * alloc_cache_pinned.cc
   * cuda_host_alloc
@@ -398,12 +465,7 @@ PYBIND11_MODULE(fastdeploy_ops, m) {
        py::arg("token_nums_per_expert"), py::arg("token_nums_this_rank"),
        py::arg("moe_quant_type"), "ep moe export dispatch function");

  m.def("ep_moe_expert_dispatch_fp8", &EPMoeExpertDispatchFP8);

  m.def("ep_moe_expert_combine", &EPMoeExpertCombine, py::arg("ffn_out"),
        py::arg("expert_scales_float"), py::arg("permute_indices_per_token"),
@@ -437,6 +499,12 @@ PYBIND11_MODULE(fastdeploy_ops, m) {
   */
  m.def("moe_expert_ffn", &MoeExpertFFNFunc, "moe export ffn function");

  /**
   * moe/fused_moe/moe_ffn_wint2.cu
   * moe_expert_ffn_wint2
   */
  m.def("moe_expert_ffn_wint2", &MoeExpertFFNWint2Func,
        "moe export ffn wint2 function");

  /**
   * moe/fused_moe/moe_expert_reduce.cu
   * moe_expert_reduce
@@ -523,4 +591,66 @@ PYBIND11_MODULE(fastdeploy_ops, m) {

  m.def("group_swiglu_with_masked", &GroupSwigluWithMasked,
        "group_swiglu_with_masked function");

  m.def("text_image_index_out", &TextImageIndexOut,
        "text_image_index_out function");

  m.def("text_image_gather_scatter", &TextImageGatherScatter,
        "text_image_gather_scatter function");

  m.def("count_tokens_per_expert_func", &count_tokens_per_expert_func);
  m.def("tritonmoe_preprocess_func", &tritonmoe_preprocess_kernel);

  m.def("MoeWna16MarlinGemmApi", &MoeWna16MarlinGemmApi,
        py::arg("a"),
        py::arg("c_or_none"),
        py::arg("b_q_weight"),
        py::arg("b_scales"),
        py::arg("global_scale_or_none"),
        py::arg("b_zeros_or_none"),
        py::arg("g_idx_or_none"),
        py::arg("perm_or_none"),
        py::arg("workspace"),
        py::arg("sorted_token_ids"),
        py::arg("expert_ids"),
        py::arg("num_tokens_post_padded"),
        py::arg("topk_weights"),
        py::arg("moe_block_size"),
        py::arg("top_k"),
        py::arg("mul_topk_weights"),
        py::arg("is_ep"),
        py::arg("b_q_type_str"),
        py::arg("size_m"),
        py::arg("size_n"),
        py::arg("size_k"),
        py::arg("is_k_full"),
        py::arg("use_atomic_add"),
        py::arg("use_fp32_reduce"),
        py::arg("is_zp_float"));

  /**
   * cutlass_scaled_mm.cu
   * cutlass_scaled_mm
   * cutlass_scaled_mm_azp
   */
  m.def("cutlass_scaled_mm", &CutlassScaledMm, "cutlass_scaled_mm function");
  m.def("cutlass_scaled_mm_azp", &CutlassScaledMmAzp,
        "cutlass_scaled_mm_azp function");

  /**
   * quantization/common.cu
   * static_scaled_fp8_quant
   * dynamic_scaled_fp8_quant
   * dynamic_per_token_scaled_fp8_quant
   */
  m.def("static_scaled_fp8_quant", &StaticScaledFp8Quant,
        "static_scaled_fp8_quant function",
        py::arg("out"), py::arg("input"), py::arg("scale"));

  m.def("dynamic_scaled_fp8_quant", &DynamicScaledFp8Quant,
        "dynamic_scaled_fp8_quant function",
        py::arg("out"), py::arg("input"), py::arg("scale"));

  m.def("dynamic_per_token_scaled_fp8_quant", &DynamicPerTokenScaledFp8Quant,
        "dynamic_per_token_scaled_fp8_quant function",
        py::arg("out"), py::arg("input"), py::arg("scales"), py::arg("scale_ub"));
}
250
custom_ops/gpu_ops/cutlass_extensions/arch/memory_copy_sm80.h
Normal file
@@ -0,0 +1,250 @@
/***************************************************************************************************
 * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 * list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 * contributors may be used to endorse or promote products derived from
 * this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 **************************************************************************************************/

/*! \file
  \brief Architecture-specific operators on memory added for SM80
*/

#pragma once

#include "cutlass/cutlass.h"
#include "cutlass/complex.h"
#include "cutlass/arch/memory.h"
#include "cutlass/arch/memory_sm75.h"
#include "cutlass/arch/memory_sm80.h"
#include "cutlass/arch/cache_operation.h"

namespace cutlass {
namespace arch {

////////////////////////////////////////////////////////////////////////////////////////////////////

/// Initiates an asynchronous copy from global memory to shared memory.
///
/// cp.async
///
template <
    /// Size of the access in bytes
    int SizeInBytes,
    /// Cache operation
    CacheOperation::Kind cache_op = CacheOperation::Always,
    bool GlobalToShared = true>
struct copy;

/// Initiates an asynchronous copy from global memory to shared memory. Rather than predicate
/// the entire transfer, zeros are written to SMEM if the guard predicate is false.
///
/// cp.async
///
template <
    /// Size of the access in bytes
    int SizeInBytes,
    /// Cache operation
    CacheOperation::Kind cache_op = CacheOperation::Always,
    bool GlobalToShared = true>
struct copy_zfill;

/// Blocks until all but <N> previous cp.async.commit_group operations have committed.
///
/// cp.async
///
template <int N, bool GlobalToShared = true>
struct copy_wait;

////////////////////////////////////////////////////////////////////////////////////////////////////

/// Partial specialization
template <
    /// Size of the access in bytes
    int SizeInBytes>
struct copy<SizeInBytes, CacheOperation::Always, true> {

  /// Copy
  CUTLASS_DEVICE
  copy(void *smem_ptr, void const *global_ptr, bool pred_guard = true) {
    cp_async<SizeInBytes, CacheOperation::Always>(smem_ptr, global_ptr, pred_guard);
  }
};

/// Partial specialization
template <
    /// Size of the access in bytes
    int SizeInBytes>
struct copy<SizeInBytes, CacheOperation::Always, false> {

  /// Copy
  CUTLASS_DEVICE
  copy(void *smem_ptr, void const *global_ptr, bool pred_guard = true) {
    using AccessType = Array<uint8_t, SizeInBytes>;

    if (pred_guard) {
      *static_cast<AccessType *>(smem_ptr) = *static_cast<AccessType const *>(global_ptr);
    }
  }
};

/// Partial specialization
template <
    /// Size of the access in bytes
    int SizeInBytes>
struct copy_zfill<SizeInBytes, CacheOperation::Always, true> {

  /// Copy with zero fill
  CUTLASS_DEVICE
  copy_zfill(void *smem_ptr, void const *global_ptr, bool pred_guard) {
    cp_async_zfill<SizeInBytes, CacheOperation::Always>(smem_ptr, global_ptr, pred_guard);
  }
};

/// Partial specialization
template <
    /// Size of the access in bytes
    int SizeInBytes>
struct copy_zfill<SizeInBytes, CacheOperation::Always, false> {

  /// Copy with zero fill
  CUTLASS_DEVICE
  copy_zfill(void *smem_ptr, void const *global_ptr, bool pred_guard) {
    using AccessType = Array<uint8_t, SizeInBytes>;

    if (pred_guard) {
      *static_cast<AccessType *>(smem_ptr) = *static_cast<AccessType const *>(global_ptr);
    }
    else {
      AccessType zeros;
      zeros.clear();
      *static_cast<AccessType *>(smem_ptr) = zeros;
    }
  }
};

////////////////////////////////////////////////////////////////////////////////////////////////////

/// Partial specialization
template <
    /// Size of the access in bytes
    int SizeInBytes>
struct copy<SizeInBytes, CacheOperation::Global, true> {

  /// Copy
  CUTLASS_DEVICE
  copy(void *smem_ptr, void const *global_ptr, bool pred_guard = true) {
    cp_async<SizeInBytes, CacheOperation::Global>(smem_ptr, global_ptr, pred_guard);
  }
};

/// Partial specialization
template <
    /// Size of the access in bytes
    int SizeInBytes>
struct copy<SizeInBytes, CacheOperation::Global, false> {

  /// Copy
  CUTLASS_DEVICE
  copy(void *smem_ptr, void const *global_ptr, bool pred_guard = true) {
    using AccessType = Array<uint8_t, SizeInBytes>;

    if (pred_guard) {
      *static_cast<AccessType *>(smem_ptr) = *static_cast<AccessType const *>(global_ptr);
    }
  }
};

/// Partial specialization
template <
    /// Size of the access in bytes
    int SizeInBytes>
struct copy_zfill<SizeInBytes, CacheOperation::Global, true> {

  /// Copy with zero fill
  CUTLASS_DEVICE
  copy_zfill(void *smem_ptr, void const *global_ptr, bool pred_guard = true) {
    cp_async_zfill<SizeInBytes, CacheOperation::Global>(smem_ptr, global_ptr, pred_guard);
  }
};

/// Partial specialization
template <
    /// Size of the access in bytes
    int SizeInBytes>
struct copy_zfill<SizeInBytes, CacheOperation::Global, false> {

  /// Copy with zero fill
  CUTLASS_DEVICE
  copy_zfill(void *smem_ptr, void const *global_ptr, bool pred_guard = true) {
    using AccessType = Array<uint8_t, SizeInBytes>;

    if (pred_guard) {
      *static_cast<AccessType *>(smem_ptr) = *static_cast<AccessType const *>(global_ptr);
    }
    else {
      AccessType zeros;
      zeros.clear();
      *static_cast<AccessType *>(smem_ptr) = zeros;
    }
  }
};

/// Establishes an ordering w.r.t previously issued cp.async instructions. Does not block.
template <bool GlobalToShared>
CUTLASS_DEVICE
void copy_fence() {}

template <>
CUTLASS_DEVICE
void copy_fence<true>() {
  cp_async_fence();
}

////////////////////////////////////////////////////////////////////////////////////////////////////

/// Partial specialization
template <int N>
struct copy_wait<N, false> {

  CUTLASS_DEVICE
  copy_wait() {}
};

/// Partial specialization
template <int N>
struct copy_wait<N, true> {

  CUTLASS_DEVICE
  copy_wait() { cp_async_wait<N>(); }
};

/////////////////////////////////////////////////////////////////////////////////////////////////

} // namespace arch
} // namespace cutlass

/////////////////////////////////////////////////////////////////////////////////////////////////
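The new header only wraps the SM80 cp.async primitives behind a GlobalToShared switch. A minimal device-side usage sketch follows; it assumes CUTLASS and this header are available on the include path, and the helper function itself is illustrative rather than code from this commit.

// Illustrative device helper (assumption: not part of the commit) showing how
// the copy_zfill / copy_fence / copy_wait wrappers above are meant to combine.
#include "cutlass_extensions/arch/memory_copy_sm80.h"  // assumed include path

template <bool GlobalToShared>
__device__ void stage_fragment(void *smem_ptr, void const *gmem_ptr, bool in_bounds) {
  // 16-byte guarded copy; zero-fills shared memory when the predicate is false.
  cutlass::arch::copy_zfill<16, cutlass::arch::CacheOperation::Global,
                            GlobalToShared>(smem_ptr, gmem_ptr, in_bounds);
  // Commit the cp.async group (a no-op in the synchronous specialization).
  cutlass::arch::copy_fence<GlobalToShared>();
  // Wait for all outstanding groups, then make the tile visible to the CTA.
  cutlass::arch::copy_wait<0, GlobalToShared>();
  __syncthreads();
}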
Some files were not shown because too many files have changed in this diff.