Mirror of https://github.com/PaddlePaddle/FastDeploy.git (synced 2025-10-03 15:56:49 +08:00)

Commit: Sync v2.0 version of code to github repo
.clang-format  (new file, 29 lines)
@@ -0,0 +1,29 @@
# This file is used by clang-format to autoformat paddle source code
#
# clang-format is part of the llvm toolchain.
# Install llvm and clang to format the source code style.
#
# The basic usage is,
# clang-format -i -style=file PATH/TO/SOURCE/CODE
#
# The -style=file option implicitly uses the ".clang-format" file located in a
# parent directory.
# The -i flag means in-place change.
#
# The documentation of clang-format is at
# http://clang.llvm.org/docs/ClangFormat.html
# http://clang.llvm.org/docs/ClangFormatStyleOptions.html
---
Language: Cpp
BasedOnStyle: Google
IndentWidth: 4
TabWidth: 2
ContinuationIndentWidth: 4
AccessModifierOffset: -1 # private/protected/public have no indent in class
Standard: Cpp11
AllowAllParametersOfDeclarationOnNextLine: true
BinPackParameters: false
BinPackArguments: false
IncludeBlocks: Preserve
IncludeIsMainSourceRegex: (\.cu)$
...
.gitignore  (vendored, 6 changed lines)
@@ -121,7 +121,7 @@ dmypy.json
FETCH_HEAD

#log
log/
log*/

checkpoints/
checkpoints_origin/
@@ -158,3 +158,7 @@ custom_ops/gpu_ops/fp8_deep_gemm/deep_gemm/include/cute

# buff
custom_ops/tmp*

build

.ccls-cache
.pre-commit-config.yaml  (modified)
@@ -16,7 +16,7 @@ repos:
    rev: v0.11.7
    hooks:
      - id: ruff
        args: [--output-format, github, --fix]
        args: [--output-format, github, --fix, --line-length=120]
  # # spell check
  # - repo: https://github.com/codespell-project/codespell
  #   rev: v2.4.1
@@ -29,14 +29,15 @@ repos:
    rev: 6.0.1
    hooks:
      - id: isort
  # formatting
  - repo: https://github.com/pre-commit/mirrors-clang-format
    rev: v20.1.3
    hooks:
      - id: clang-format
        # exclude: '.*'
        types_or: [c++, cuda]
        args: [--style=file, --verbose]
  # # formatting
  # - repo: https://github.com/pre-commit/mirrors-clang-format
  #   rev: v20.1.3
  #   hooks:
  #     - id: clang-format
  #       # exclude: '.*'
  #       types_or: [c++, cuda]
  #       args: [--style=file, --verbose]

  # markdown
  - repo: https://github.com/jackdewinter/pymarkdown
    rev: v0.9.29
README.md  (156 changed lines)
@@ -1,9 +1,8 @@
# FastDeploy 2.0: Large Model Inference and Deployment

<p align="center">
    <a href="./LICENSE"><img src="https://img.shields.io/badge/license-Apache%202-dfd.svg"></a>
    <a href="https://github.com/PaddlePaddle/FastDeploy/releases"><img src="https://img.shields.io/github/v/release/PaddlePaddle/FastDeploy?color=ffa"></a>
    <a href=""><img src="https://img.shields.io/badge/python-3.10+-aff.svg"></a>
    <a href="https://github.com/PaddlePaddle/FastDeploy/releases"><img src="https://github.com/user-attachments/assets/42b0039f-39e3-4279-afda-6d1865dfbffb" width="500"></a>
</p>
<p align="center">
    <a href=""><img src="https://img.shields.io/badge/python-3.10-aff.svg"></a>
    <a href=""><img src="https://img.shields.io/badge/os-linux-pink.svg"></a>
    <a href="https://github.com/PaddlePaddle/FastDeploy/graphs/contributors"><img src="https://img.shields.io/github/contributors/PaddlePaddle/FastDeploy?color=9ea"></a>
    <a href="https://github.com/PaddlePaddle/FastDeploy/commits"><img src="https://img.shields.io/github/commit-activity/m/PaddlePaddle/FastDeploy?color=3af"></a>
@@ -11,105 +10,78 @@
    <a href="https://github.com/PaddlePaddle/FastDeploy/stargazers"><img src="https://img.shields.io/github/stars/PaddlePaddle/FastDeploy?color=ccf"></a>
</p>

FastDeploy 2.0 supports inference for a range of large models (currently only Qwen2; more models will be supported soon). Its inference and deployment features cover:
<p align="center">
    <a href="docs/get_started/installation/README.md"><b> Installation </b></a>
    |
    <a href="docs/get_started.md"><b> Quick Start </b></a>
    |
    <a href="docs/supported_models.md"><b> Supported Models </b></a>
</p>

- One-command service deployment of a model, with streaming generation support
- Tensor parallelism to accelerate model inference
- PagedAttention and continuous batching (dynamic batching)
- OpenAI-compatible HTTP protocol
- Lossless weight-only int8/int4 compression
- Prometheus metrics
--------------------------------------------------------------------------------
# FastDeploy 2.0: Inference and Deployment Toolkit for LLMs and VLMs based on PaddlePaddle

> Note: If you are still using FastDeploy to deploy small models (e.g., PaddleClas/PaddleOCR CV suite models), please check out the [release/1.1.0 branch](https://github.com/PaddlePaddle/FastDeploy/tree/release/1.1.0).
## News

## Environment Requirements
- A800/H800/H100
- Python>=3.10
- CUDA>=12.3
- CUDNN>=9.5
- Linux X64
**[2025-06] 🔥 Released FastDeploy v2.0:** Supports inference and deployment for ERNIE 4.5. Furthermore, we open-source an industrial-grade PD disaggregation with context caching, dynamic role switching for effective resource utilization to further enhance inference performance for MoE models.

## Installation
## About

### Docker installation (recommended)
```
docker pull ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/fastdeploy:2.0.0.0-alpha
```
**FastDeploy** is an inference and deployment toolkit for large language models and visual language models based on PaddlePaddle. It delivers **production-ready, out-of-the-box deployment solutions** with core acceleration technologies:

### Install from source
#### Install PaddlePaddle
> Note: install the nightly build (code newer than 2025.05.30). See the [PaddlePaddle installation guide](https://www.paddlepaddle.org.cn/en/install/quick?docurl=/documentation/docs/en/develop/install/pip/linux-pip_en.html) and choose the CUDA 12.6 develop (nightly build) version.
```
python -m pip install --pre paddlepaddle-gpu -i https://www.paddlepaddle.org.cn/packages/nightly/cu126/
```
- 🚀 **Load-Balanced PD Disaggregation**: Industrial-grade solution featuring context caching and dynamic instance role switching. Optimizes resource utilization while balancing SLO compliance and throughput.
- 🔄 **Unified KV Cache Transmission**: Lightweight high-performance transport library with intelligent NVLink/RDMA selection.
- 🤝 **OpenAI API Server and vLLM Compatible**: One-command deployment with [vLLM](https://github.com/vllm-project/vllm/) interface compatibility.
- 🧮 **Comprehensive Quantization Format Support**: W8A16, W8A8, W4A16, W4A8, W2A16, FP8, and more.
- ⏩ **Advanced Acceleration Techniques**: Speculative decoding, Multi-Token Prediction (MTP) and Chunked Prefill.
- 🖥️ **Multi-Hardware Support**: NVIDIA GPU, Kunlunxin XPU, Hygon DCU, Ascend NPU, Iluvatar GPU, Enflame GCU, MetaX GPU etc.

#### Build and install FastDeploy
## Requirements

```
# build
cd FastDeploy
bash build.sh
# install
pip install dist/fastdeploy-2.0.0a0-py3-none-any.whl
```
- OS: Linux
- Python: 3.10 ~ 3.12

## Quick Start
## Installation

After installation, run the following command to quickly deploy the Qwen2 model; see the [parameter reference](docs/serving.md) for more options and their meanings.
FastDeploy supports inference deployment on **NVIDIA GPUs**, **Kunlunxin XPUs**, **Iluvatar GPUs**, **Enflame GCUs**, and other hardware. For detailed installation instructions:

``` shell
# Download and extract the Qwen model
wget https://fastdeploy.bj.bcebos.com/llm/models/Qwen2-7B-Instruct.tar.gz && tar xvf Qwen2-7B-Instruct.tar.gz
# Deploy on a single GPU
python -m fastdeploy.entrypoints.openai.api_server --model ./Qwen2-7B-Instruct --port 8188 --tensor-parallel-size 1
```
- [NVIDIA GPU](./docs/installation/nvidia_cuda.md)
- [Kunlunxin XPU](./docs/en/get_started/installation/kunlunxin_xpu.md)
- [Iluvatar GPU](./docs/en/get_started/installation/iluvatar_gpu.md)
- [Enflame GCU](./docs/en/get_started/installation/Enflame_gcu.md)

Query the model service with the following command
``` shell
curl -X POST "http://0.0.0.0:8188/v1/chat/completions" \
  -H "Content-Type: application/json" \
  -d '{
    "messages": [
      {"role": "user", "content": "你好,你的名字是什么?"}
    ]
  }'
```
The response looks like this
``` json
{
  "id": "chatcmpl-db662f47-7c8c-4945-9a7a-db563b2ddd8d",
  "object": "chat.completion",
  "created": 1749451045,
  "model": "default",
  "choices": [
    {
      "index": 0,
      "message": {
        "role": "assistant",
        "content": "你好!我叫通义千问。",
        "reasoning_content": null
      },
      "finish_reason": "stop"
    }
  ],
  "usage": {
    "prompt_tokens": 25,
    "total_tokens": 35,
    "completion_tokens": 10,
    "prompt_tokens_details": null
  }
}
```
FastDeploy provides a fully OpenAI-compatible service API (the `model` and `api_key` fields are currently not supported and are ignored if set); you can also query the service with the openai Python API.
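A minimal sketch of that Python usage (not part of this commit): the port follows the deployment example above, the `openai` package (>=1.0) is assumed to be installed, and the `model`/`api_key` values are placeholders since FastDeploy ignores both fields.

``` python
# Minimal sketch: query the FastDeploy OpenAI-compatible server started above.
# "EMPTY" and "default" are placeholders; FastDeploy ignores api_key and model.
from openai import OpenAI

client = OpenAI(base_url="http://0.0.0.0:8188/v1", api_key="EMPTY")
response = client.chat.completions.create(
    model="default",
    messages=[{"role": "user", "content": "你好,你的名字是什么?"}],
)
print(response.choices[0].message.content)
```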
**Note:** We are actively working on expanding hardware support. Additional hardware platforms including Ascend NPU, Hygon DCU, and MetaX GPU are currently under development and testing. Stay tuned for updates!

## Deployment Documentation
- [Local deployment](docs/offline_inference.md)
- [Service deployment](docs/serving.md)
- [Service metrics](docs/metrics.md)
## Get Started

# Code Guide
- [Code directory overview](docs/code_guide.md)
- For any suggestions or problems while using FastDeploy, feel free to report them via an issue.
Learn how to use FastDeploy through our documentation:
- [10-Minutes Quick Deployment](./docs/get_started/quick_start.md)
- [ERNIE-4.5 Large Language Model Deployment](./docs/get_started/ernie-4.5.md)
- [ERNIE-4.5-VL Multimodal Model Deployment](./docs/get_started/ernie-4.5-vl.md)
- [Offline Inference Development](./docs/offline_inference.md)
- [Online Service Deployment](./docs/serving/README.md)
- [Full Supported Models List](./docs/supported_models.md)

# License
FastDeploy follows the [Apache-2.0 open-source license](./LICENSE). During development, to align with the [vLLM](https://github.com/vllm-project/vllm) interfaces, parts of the vLLM code were referenced and used directly, for which we are grateful.
## Supported Models

| Model | Data Type | PD Disaggregation | Chunked Prefill | Prefix Caching | MTP | CUDA Graph | Maximum Context Length |
|:--- | :------- | :---------- | :-------- | :-------- | :----- | :----- | :----- |
| ERNIE-4.5-300B-A47B | BF16/WINT4/WINT8/W4A8C8/WINT2/FP8 | ✅ (WINT4/W4A8C8/Expert Parallelism) | ✅ | ✅ | ✅ (WINT4) | WIP | 128K |
| ERNIE-4.5-300B-A47B-Base | BF16/WINT4/WINT8 | ✅ (WINT4/Expert Parallelism) | ✅ | ✅ | ✅ (WINT4) | ❌ | 128K |
| ERNIE-4.5-VL-424B-A47B | BF16/WINT4/WINT8 | WIP | ✅ | WIP | ❌ | WIP | 128K |
| ERNIE-4.5-VL-28B-A3B | BF16/WINT4/WINT8 | ❌ | ✅ | WIP | ❌ | WIP | 128K |
| ERNIE-4.5-21B-A3B | BF16/WINT4/WINT8/FP8 | ❌ | ✅ | ✅ | WIP | ✅ | 128K |
| ERNIE-4.5-21B-A3B-Base | BF16/WINT4/WINT8/FP8 | ❌ | ✅ | ✅ | WIP | ✅ | 128K |
| ERNIE-4.5-0.3B | BF16/WINT8/FP8 | ❌ | ✅ | ✅ | ❌ | ✅ | 128K |

## Advanced Usage

- [Quantization](./docs/quantization/README.md)
- [PD Disaggregation Deployment](./docs/features/pd_disaggregation.md)
- [Speculative Decoding](./docs/features/speculative_decoding.md)
- [Prefix Caching](./docs/features/prefix_caching.md)
- [Chunked Prefill](./docs/features/chunked_prefill.md)

## Acknowledgement

FastDeploy is licensed under the [Apache-2.0 open-source license](./LICENSE). During development, portions of [vLLM](https://github.com/vllm-project/vllm) code were referenced and incorporated to maintain interface compatibility, for which we express our gratitude.
benchmarks/README.md  (new file, 106 lines)
@@ -0,0 +1,106 @@
### FastDeploy serving performance benchmark tool

#### Dataset:

Download it locally with wget for performance testing

<table style="width:100%; border-collapse: collapse;">
<thead>
<tr>
<th style="width:15%; text-align: left;">Dataset</th>
<th style="width:65%; text-align: left;">Data Path</th>
</tr>
</thead>
<tbody>
<tr>
<td><strong>Open-source dataset, 2k entries</strong></td>
<td><code>https://fastdeploy.bj.bcebos.com/eb_query/filtered_sharedgpt_2000_input_1136_output_200_fd.json</code></td>
</tr>
</tbody>
</table>
#### Usage:

```
# Install dependencies
python -m pip install -r requirements.txt
```

##### Parameter description

```bash
--backend openai-chat: backend interface used for the benchmark; "openai-chat" uses the chat/completions endpoint
--model EB45T: model name; any name works, it only affects the name of the result file that gets saved
--endpoint /v1/chat/completions: endpoint, used to build the request URL
--host 0.0.0.0: service IP address, used to build the request URL
--port 9812: service HTTP port, used to build the request URL
--dataset-name EBChat: dataset class; "EBChat" reads datasets converted to the FD format
--dataset-path ./eb45t_spv4_dataserver_1w_waigua_fd: path to the benchmark dataset
--hyperparameter-path EB45T.yaml: (optional) hyperparameter file; its values are merged into the request payload. By default no hyperparameters are sent
--percentile-metrics ttft,tpot,itl,e2el,s_ttft,s_itl,s_e2el,s_decode,input_len,s_input_len,output_len: set of metrics shown in the performance results
--metric-percentiles 80,95,99,99.9,99.95,99.99: percentiles shown for the performance metrics
--num-prompts 1: total number of requests to send
--max-concurrency 1: benchmark concurrency
--save-result: enable result saving; results are written to a JSON file
```
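For reference, a small sketch (not part of this commit) of what `--hyperparameter-path` does: the YAML keys are merged into each request payload, mirroring the `payload.update(...)` call in `backend_request_func.py` below. The file name and keys here are illustrative only.

```python
# Sketch: how a hyperparameter YAML ends up in the request payload (illustrative).
import yaml

# Hypothetical file; the accepted keys depend on the serving API.
with open("yaml/request_yaml/eb45t-32k.yaml") as f:
    hyper_parameters = yaml.safe_load(f) or {}

payload = {
    "model": "default",
    "messages": [{"role": "user", "content": "hello"}],
    "stream": True,
}
payload.update(hyper_parameters)  # same merge as in backend_request_func.py
```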
##### Single-request debugging against the /v1/chat/completions endpoint

```
python benchmark_serving.py \
    --backend openai-chat \
    --model EB45T \
    --endpoint /v1/chat/completions \
    --host 0.0.0.0 \
    --port 9812 \
    --dataset-name EBChat \
    --dataset-path ./filtered_sharedgpt_2000_input_1136_output_200_fd.json \
    --hyperparameter-path yaml/request_yaml/eb45t-32k.yaml \
    --percentile-metrics ttft,tpot,itl,e2el,s_ttft,s_itl,s_e2el,s_decode,input_len,s_input_len,output_len \
    --metric-percentiles 80,95,99,99.9,99.95,99.99 \
    --num-prompts 1 \
    --max-concurrency 1 \
    --save-result
```

##### Full /v1/chat/completions benchmark: 100 concurrency, 2000 requests

```
# Save output to infer_log.txt
python benchmark_serving.py \
    --backend openai-chat \
    --model EB45T \
    --endpoint /v1/chat/completions \
    --host 0.0.0.0 \
    --port 9812 \
    --dataset-name EBChat \
    --dataset-path ./filtered_sharedgpt_2000_input_1136_output_200_fd.json \
    --hyperparameter-path yaml/request_yaml/eb45t-32k.yaml \
    --percentile-metrics ttft,tpot,itl,e2el,s_ttft,s_itl,s_e2el,s_decode,input_len,s_input_len,output_len \
    --metric-percentiles 80,95,99,99.9,99.95,99.99 \
    --num-prompts 2000 \
    --max-concurrency 100 \
    --save-result > infer_log.txt 2>&1 &
```

##### Benchmarking the /v1/completions endpoint

Change the endpoint to /v1/completions and the backend to openai to benchmark the /v1/completions endpoint

```
# Save output to infer_log.txt
python benchmark_serving.py \
    --backend openai \
    --model EB45T \
    --endpoint /v1/completions \
    --host 0.0.0.0 \
    --port 9812 \
    --dataset-name EBChat \
    --dataset-path ./filtered_sharedgpt_2000_input_1136_output_200_fd.json \
    --hyperparameter-path yaml/request_yaml/eb45t-32k.yaml \
    --percentile-metrics ttft,tpot,itl,e2el,s_ttft,s_itl,s_e2el,s_decode,input_len,s_input_len,output_len \
    --metric-percentiles 80,95,99,99.9,99.95,99.99 \
    --num-prompts 2000 \
    --max-concurrency 100 \
    --save-result > infer_log.txt 2>&1 &
```
benchmarks/backend_request_func.py  (new file, 700 lines)
@@ -0,0 +1,700 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""

# This file is modified from https://github.com/vllm-project/vllm/blob/main/benchmarks/backend_request_func.py


import io
import json
import os
import sys
import time
import traceback
from dataclasses import dataclass, field
from typing import Optional

import aiohttp
from tqdm.asyncio import tqdm


AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=6 * 60 * 60)


@dataclass
class RequestFuncInput:
    """Input for requesting LLMs via API"""
    prompt: str
    history_QA: Optional[dict]
    hyper_parameters: dict
    api_url: str
    prompt_len: int
    output_len: int
    model: str
    model_name: Optional[str] = None
    logprobs: Optional[int] = None
    extra_body: Optional[dict] = None
    multi_modal_content: Optional[dict] = None
    ignore_eos: bool = False
    language: Optional[str] = None


@dataclass
class RequestFuncOutput:
    """Output for requesting LLMs via API"""
    generated_text: str = ""
    reasoning_content: str = ""
    success: bool = False
    latency: float = 0.0
    output_tokens: int = 0
    ttft: float = 0.0  # Time to first token
    arrival_time: list = field(default_factory=list)  # arrival_time
    itl: list = field(default_factory=list)  # list of inter-token latencies
    tpot: float = 0.0  # avg next-token latencies
    prompt_len: int = 0
    prompt_tokens: int = 0  # number of input tokens reported by the inference side
    error: str = ""


async def async_request_eb_openai_chat_completions(
    request_func_input: RequestFuncInput,
    pbar: Optional[tqdm] = None,
) -> RequestFuncOutput:
    """Request an LLM using EB OpenAI"""
    api_url = request_func_input.api_url
    assert api_url.endswith(
        ("completions", "profile")
    ), "OpenAI Chat Completions API URL must end with 'completions'."

    async with aiohttp.ClientSession(trust_env=True,
                                     timeout=AIOHTTP_TIMEOUT) as session:
        content = [{"type": "text", "text": request_func_input.prompt}]
        if request_func_input.multi_modal_content:
            content.append(request_func_input.multi_modal_content)
        payload = {
            "model": "default",
            "messages": request_func_input.history_QA,
            "stream": True,
            "stream_options": {
                "include_usage": True,
                "continuous_usage_stats": True
            },
        }
        # Hyperparameters are passed in via the YAML file
        payload.update(request_func_input.hyper_parameters)

        if request_func_input.ignore_eos:
            payload["ignore_eos"] = request_func_input.ignore_eos
        headers = {
            "Content-Type": "application/json",
            "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}",
        }

        output = RequestFuncOutput()
        output.prompt_len = 0

        ttft = 0.0
        st = time.perf_counter()
        most_recent_timestamp = st
        try:
            async with session.post(url=api_url, json=payload,
                                    headers=headers) as response:
                if response.status == 200:
                    async for chunk_bytes in response.content:
                        chunk_bytes = chunk_bytes.strip()
                        if not chunk_bytes:
                            continue

                        chunk = chunk_bytes.decode("utf-8").removeprefix(
                            "data: ")
                        if chunk != "[DONE]":
                            # print("####chunk:", chunk, type(chunk))
                            timestamp = time.perf_counter()
                            data = json.loads(chunk)

                            if choices := data.get("choices"):
                                content = choices[0]["delta"].get("content")
                                reason_content = choices[0]["delta"].get("reasoning_content")
                                # First token
                                if ttft == 0.0:
                                    ttft = timestamp - st
                                    output.ttft = ttft
                                    # cached_tokens
                                    output.prompt_len = data["usage"]["prompt_tokens_details"]["cached_tokens"]

                                # Decoding phase
                                else:
                                    output.itl.append(timestamp -
                                                      most_recent_timestamp)

                                output.generated_text += content or ""
                                output.reasoning_content += reason_content or ""
                                output.arrival_time.append(choices[0].get("arrival_time"))
                            elif usage := data.get("usage"):
                                output.output_tokens = usage.get(
                                    "completion_tokens")
                                output.prompt_tokens = usage.get(
                                    "prompt_tokens")

                            most_recent_timestamp = timestamp

                    # output.generated_text = generated_text
                    if output.generated_text.strip() == "":
                        output.success = False
                        output.error = "No generated text found!"
                    else:
                        output.success = True
                        output.latency = most_recent_timestamp - st
                else:
                    error_text = await response.text()
                    print("####error response:", error_text, "####payload:", payload)
                    output.error = error_text or ""
                    output.success = False
        except Exception:
            output.success = False
            exc_info = sys.exc_info()
            output.error = "".join(traceback.format_exception(*exc_info))

        # Save the results of failed requests
        if not output.success:
            with open("error_output.txt", "a") as f:
                f.write(str(output) + "\n")
        if pbar:
            pbar.update(1)
        return output


async def async_request_eb_openai_completions(
    request_func_input: RequestFuncInput,
    pbar: Optional[tqdm] = None,
) -> RequestFuncOutput:
    """Request an LLM using EB OpenAI"""
    api_url = request_func_input.api_url
    assert api_url.endswith(
        ("completions", "profile")
    ), "OpenAI Completions API URL must end with 'completions' or 'profile'."

    async with aiohttp.ClientSession(trust_env=True,
                                     timeout=AIOHTTP_TIMEOUT) as session:
        payload = {
            "model": "default",
            "prompt": request_func_input.prompt,
            "stream": True,
            "stream_options": {
                "include_usage": True,
                "continuous_usage_stats": True
            },
        }
        # Hyperparameters are passed in via the YAML file
        payload.update(request_func_input.hyper_parameters)

        if request_func_input.ignore_eos:
            payload["ignore_eos"] = request_func_input.ignore_eos
        headers = {
            "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"
        }

        output = RequestFuncOutput()
        output.prompt_len = request_func_input.prompt_len

        generated_text = ""
        st = time.perf_counter()
        most_recent_timestamp = st
        try:
            async with session.post(url=api_url, json=payload,
                                    headers=headers) as response:
                if response.status == 200:
                    first_chunk_received = False
                    async for chunk_bytes in response.content:
                        chunk_bytes = chunk_bytes.strip()
                        if not chunk_bytes:
                            continue

                        chunk = chunk_bytes.decode("utf-8").removeprefix(
                            "data: ")
                        if chunk != "[DONE]":
                            # print("####chunk:", chunk, chunk.usage)
                            data = json.loads(chunk)

                            # NOTE: Some completion API might have a last
                            # usage summary response without a token so we
                            # want to check a token was generated
                            if choices := data.get("choices"):
                                # Note that text could be empty here
                                # e.g. for special tokens
                                text = choices[0].get("text")
                                timestamp = time.perf_counter()
                                # First token
                                if not first_chunk_received:
                                    first_chunk_received = True
                                    ttft = time.perf_counter() - st
                                    output.ttft = ttft

                                # Decoding phase
                                else:
                                    output.itl.append(timestamp -
                                                      most_recent_timestamp)

                                most_recent_timestamp = timestamp
                                output.arrival_time.append(choices[0].get("arrival_time"))
                                generated_text += text or ""
                            elif usage := data.get("usage"):
                                output.prompt_tokens = usage.get(
                                    "prompt_tokens")
                                output.output_tokens = usage.get(
                                    "completion_tokens")
                    if first_chunk_received:
                        output.success = True
                    else:
                        output.success = False
                        output.error = (
                            "Never received a valid chunk to calculate TTFT."
                            "This response will be marked as failed!")
                    output.generated_text = generated_text
                    output.latency = most_recent_timestamp - st
                else:
                    output.error = response.reason or ""
                    output.success = False
        except Exception:
            output.success = False
            exc_info = sys.exc_info()
            output.error = "".join(traceback.format_exception(*exc_info))

        if pbar:
            pbar.update(1)
        return output


async def async_request_tgi(
    request_func_input: RequestFuncInput,
    pbar: Optional[tqdm] = None,
) -> RequestFuncOutput:
    """Request an LLM using the TGI API"""
    api_url = request_func_input.api_url
    assert api_url.endswith("generate_stream")

    async with aiohttp.ClientSession(trust_env=True,
                                     timeout=AIOHTTP_TIMEOUT) as session:
        params = {
            "max_new_tokens": request_func_input.output_len,
            "do_sample": True,
            "temperature": 0.01,  # TGI does not accept 0.0 temperature.
            "top_p": 0.99,  # TGI does not accept 1.0 top_p.
            "truncate": request_func_input.prompt_len,
            "ignore_eos_token": request_func_input.ignore_eos,
        }
        payload = {
            "inputs": request_func_input.prompt,
            "parameters": params,
        }
        output = RequestFuncOutput()
        output.prompt_len = request_func_input.prompt_len
        if request_func_input.ignore_eos:
            output.output_tokens = request_func_input.output_len
        else:
            output.output_tokens = None

        ttft = 0.0
        st = time.perf_counter()
        most_recent_timestamp = st
        try:
            async with session.post(url=api_url, json=payload) as response:
                if response.status == 200:
                    async for chunk_bytes in response.content:
                        chunk_bytes = chunk_bytes.strip()
                        if not chunk_bytes:
                            continue
                        chunk_bytes = chunk_bytes.decode("utf-8")

                        # NOTE: Sometimes TGI returns a ping response without
                        # any data, we should skip it.
                        if chunk_bytes.startswith(":"):
                            continue
                        chunk = chunk_bytes.removeprefix("data:")

                        data = json.loads(chunk)
                        timestamp = time.perf_counter()
                        # First token
                        if ttft == 0.0:
                            ttft = time.perf_counter() - st
                            output.ttft = ttft

                        # Decoding phase
                        else:
                            output.itl.append(timestamp -
                                              most_recent_timestamp)

                        most_recent_timestamp = timestamp
                        output.arrival_time.append(data["arrival_time"])

                    output.latency = most_recent_timestamp - st
                    output.success = True
                    output.generated_text = data["generated_text"]
                else:
                    output.error = response.reason or ""
                    output.success = False
        except Exception:
            output.success = False
            exc_info = sys.exc_info()
            output.error = "".join(traceback.format_exception(*exc_info))

        if pbar:
            pbar.update(1)
        return output


async def async_request_trt_llm(
    request_func_input: RequestFuncInput,
    pbar: Optional[tqdm] = None,
) -> RequestFuncOutput:
    """Request an LLM using TRT's llm_server"""
    api_url = request_func_input.api_url
    assert api_url.endswith("generate_stream")

    async with aiohttp.ClientSession(trust_env=True,
                                     timeout=AIOHTTP_TIMEOUT) as session:
        payload = {
            "accumulate_tokens": True,
            "text_input": request_func_input.prompt,
            "temperature": 0.0,
            "top_p": 1.0,
            "max_tokens": request_func_input.output_len,
            "stream": True,
        }
        if request_func_input.ignore_eos:
            payload["min_length"] = request_func_input.output_len
        output = RequestFuncOutput()
        output.prompt_len = request_func_input.prompt_len

        ttft = 0.0
        st = time.perf_counter()
        most_recent_timestamp = st
        try:
            async with session.post(url=api_url, json=payload) as response:
                if response.status == 200:
                    async for chunk_bytes in response.content:
                        chunk_bytes = chunk_bytes.strip()
                        if not chunk_bytes:
                            continue

                        chunk = chunk_bytes.decode("utf-8").removeprefix(
                            "data:")

                        data = json.loads(chunk)
                        output.generated_text += data["text_output"]
                        timestamp = time.perf_counter()
                        # First token
                        if ttft == 0.0:
                            ttft = timestamp - st
                            output.ttft = ttft

                        # Decoding phase
                        else:
                            output.itl.append(timestamp -
                                              most_recent_timestamp)

                        most_recent_timestamp = timestamp

                    output.latency = most_recent_timestamp - st
                    output.success = True

                else:
                    output.error = response.reason or ""
                    output.success = False
        except Exception:
            output.success = False
            exc_info = sys.exc_info()
            output.error = "".join(traceback.format_exception(*exc_info))

        if pbar:
            pbar.update(1)
        return output


async def async_request_deepspeed_mii(
    request_func_input: RequestFuncInput,
    pbar: Optional[tqdm] = None,
) -> RequestFuncOutput:
    """Request an LLM using Deepspeed MII"""
    async with aiohttp.ClientSession(trust_env=True,
                                     timeout=AIOHTTP_TIMEOUT) as session:

        payload = {
            "prompt": request_func_input.prompt,
            "max_tokens": request_func_input.output_len,
            "temperature": 0.01,  # deepspeed-mii does not accept 0.0 temp.
            "top_p": 1.0,
        }
        output = RequestFuncOutput()
        output.prompt_len = request_func_input.prompt_len

        # NOTE: DeepSpeed-MII doesn't support streaming as of Jan 28 2024,
        # will use 0 as placeholder.
        # See https://github.com/microsoft/DeepSpeed-MII/pull/311
        output.ttft = 0

        st = time.perf_counter()
        try:
            async with session.post(url=request_func_input.api_url,
                                    json=payload) as response:
                if response.status == 200:
                    parsed_resp = await response.json()
                    output.latency = time.perf_counter() - st
                    if "choices" in parsed_resp:
                        output.generated_text = parsed_resp["choices"][0][
                            "text"]
                    elif "text" in parsed_resp:
                        output.generated_text = parsed_resp["text"][0]
                    else:
                        output.error = ("Unexpected response format: "
                                        "neither 'choices' nor 'text' found")
                        output.success = False
                    output.success = True
                else:
                    output.error = response.reason or ""
                    output.success = False
        except Exception:
            output.success = False
            exc_info = sys.exc_info()
            output.error = "".join(traceback.format_exception(*exc_info))

        if pbar:
            pbar.update(1)
        return output


async def async_request_openai_completions(
    request_func_input: RequestFuncInput,
    pbar: Optional[tqdm] = None,
) -> RequestFuncOutput:
    """Request an LLM using OpenAI"""
    api_url = request_func_input.api_url
    assert api_url.endswith(
        ("completions", "profile")
    ), "OpenAI Completions API URL must end with 'completions' or 'profile'."

    async with aiohttp.ClientSession(trust_env=True,
                                     timeout=AIOHTTP_TIMEOUT) as session:
        payload = {
            "model": request_func_input.model_name \
                if request_func_input.model_name else request_func_input.model,
            "prompt": request_func_input.prompt,
            # "temperature": 0.0,
            "max_tokens": request_func_input.output_len,
            "logprobs": request_func_input.logprobs,
            "stream": True,
            # "stream_options": {
            #     "include_usage": True,
            # },
        }
        if request_func_input.ignore_eos:
            payload["ignore_eos"] = request_func_input.ignore_eos

        headers = {
            "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"
        }

        output = RequestFuncOutput()
        output.prompt_len = request_func_input.prompt_len

        generated_text = ""
        st = time.perf_counter()
        most_recent_timestamp = st
        try:
            async with session.post(url=api_url, json=payload,
                                    headers=headers) as response:
                if response.status == 200:
                    first_chunk_received = False
                    async for chunk_bytes in response.content:
                        chunk_bytes = chunk_bytes.strip()
                        if not chunk_bytes:
                            continue

                        chunk = chunk_bytes.decode("utf-8").removeprefix(
                            "data: ")
                        if chunk != "[DONE]":
                            # print("####chunk:", chunk, type(chunk))
                            data = json.loads(chunk)

                            # NOTE: Some completion API might have a last
                            # usage summary response without a token so we
                            # want to check a token was generated
                            if choices := data.get("choices"):
                                # Note that text could be empty here
                                # e.g. for special tokens
                                text = choices[0].get("text")
                                timestamp = time.perf_counter()
                                # First token
                                if not first_chunk_received:
                                    first_chunk_received = True
                                    ttft = time.perf_counter() - st
                                    output.ttft = ttft

                                # Decoding phase
                                else:
                                    output.itl.append(timestamp -
                                                      most_recent_timestamp)

                                most_recent_timestamp = timestamp
                                generated_text += text or ""
                            elif usage := data.get("usage"):
                                output.output_tokens = usage.get(
                                    "completion_tokens")
                    if first_chunk_received:
                        output.success = True
                    else:
                        output.success = False
                        output.error = (
                            "Never received a valid chunk to calculate TTFT."
                            "This response will be marked as failed!")
                    output.generated_text = generated_text
                    output.latency = most_recent_timestamp - st
                else:
                    output.error = response.reason or ""
                    output.success = False
        except Exception:
            output.success = False
            exc_info = sys.exc_info()
            output.error = "".join(traceback.format_exception(*exc_info))

        if pbar:
            pbar.update(1)
        return output


async def async_request_openai_audio(
    request_func_input: RequestFuncInput,
    pbar: Optional[tqdm] = None,
) -> RequestFuncOutput:
    """Request an LLM using OpenAI"""
    # Lazy import without PlaceholderModule to avoid vllm dep.
    import soundfile
    api_url = request_func_input.api_url
    assert api_url.endswith(
        ("transcriptions", "translations")
    ), ("OpenAI Chat Completions API URL must end with 'transcriptions' "
        "or 'translations'.")

    async with aiohttp.ClientSession(trust_env=True,
                                     timeout=AIOHTTP_TIMEOUT) as session:
        content = [{"type": "text", "text": request_func_input.prompt}]
        payload = {
            "model": request_func_input.model_name \
                if request_func_input.model_name else request_func_input.model,
            "temperature": 0.0,
            "max_completion_tokens": request_func_input.output_len,
            "stream": True,
            "language": "en",
            # Flattened due to multipart/form-data
            "stream_include_usage": True,
            "stream_continuous_usage_stats": True
        }
        if request_func_input.extra_body:
            payload.update(request_func_input.extra_body)
        headers = {
            "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}",
        }

        # Send audio file
        def to_bytes(y, sr):
            buffer = io.BytesIO()
            soundfile.write(buffer, y, sr, format="WAV")
            buffer.seek(0)
            return buffer

        with to_bytes(*request_func_input.multi_modal_content['audio']) as f:
            form = aiohttp.FormData()
            form.add_field('file', f, content_type='audio/wav')
            for key, value in payload.items():
                form.add_field(key, str(value))

            output = RequestFuncOutput()
            output.prompt_len = request_func_input.prompt_len

            generated_text = ""
            ttft = 0.0
            st = time.perf_counter()
            most_recent_timestamp = st
            try:
                async with session.post(url=api_url,
                                        data=form,
                                        headers=headers) as response:
                    if response.status == 200:
                        async for chunk_bytes in response.content:
                            chunk_bytes = chunk_bytes.strip()
                            if not chunk_bytes:
                                continue

                            chunk = chunk_bytes.decode("utf-8").removeprefix(
                                "data: ")
                            if chunk != "[DONE]":
                                timestamp = time.perf_counter()
                                data = json.loads(chunk)

                                if choices := data.get("choices"):
                                    content = choices[0]["delta"].get(
                                        "content")
                                    # First token
                                    if ttft == 0.0:
                                        ttft = timestamp - st
                                        output.ttft = ttft

                                    # Decoding phase
                                    else:
                                        output.itl.append(
                                            timestamp - most_recent_timestamp)

                                    generated_text += content or ""
                                elif usage := data.get("usage"):
                                    output.output_tokens = usage.get(
                                        "completion_tokens")

                                most_recent_timestamp = timestamp

                        output.generated_text = generated_text
                        output.success = True
                        output.latency = most_recent_timestamp - st
                    else:
                        output.error = response.reason or ""
                        output.success = False
            except Exception:
                output.success = False
                exc_info = sys.exc_info()
                output.error = "".join(traceback.format_exception(*exc_info))

        if pbar:
            pbar.update(1)
        return output


ASYNC_REQUEST_FUNCS = {
    "tgi": async_request_tgi,
    "vllm": async_request_openai_completions,
    "lmdeploy": async_request_openai_completions,
    "deepspeed-mii": async_request_deepspeed_mii,
    "openai": async_request_eb_openai_completions,
    "openai-chat": async_request_eb_openai_chat_completions,
    "openai-audio": async_request_openai_audio,
    "tensorrt-llm": async_request_trt_llm,
    "scalellm": async_request_openai_completions,
    "sglang": async_request_openai_completions,
}

OPENAI_COMPATIBLE_BACKENDS = [
    k for k, v in ASYNC_REQUEST_FUNCS.items()
    if v in (async_request_openai_completions,
             async_request_eb_openai_chat_completions)
]
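Not part of the diff: a minimal sketch of how these request functions are driven. `benchmark_serving.py` is the real driver; this just looks a backend up in `ASYNC_REQUEST_FUNCS` and awaits it. The URL, model name, and lengths are placeholders, and the import assumes the working directory is `benchmarks/`.

```python
# Minimal driver sketch (assumed usage; benchmark_serving.py is the real driver).
import asyncio

from backend_request_func import ASYNC_REQUEST_FUNCS, RequestFuncInput


async def main():
    request_func = ASYNC_REQUEST_FUNCS["openai-chat"]
    req = RequestFuncInput(
        prompt="hello",
        history_QA=[{"role": "user", "content": "hello"}],
        hyper_parameters={},
        api_url="http://0.0.0.0:9812/v1/chat/completions",  # placeholder
        prompt_len=0,
        output_len=128,
        model="EB45T",
    )
    out = await request_func(req)
    print(out.success, out.ttft, len(out.generated_text))


asyncio.run(main())
```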
benchmarks/benchmark_dataset.py  (new file, 309 lines)
@@ -0,0 +1,309 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""

# This file is modified from https://github.com/vllm-project/vllm/blob/main/benchmarks/benchmark_dataset.py


import base64
import io
import json
import logging
import random
from abc import ABC, abstractmethod
from collections.abc import Mapping
from dataclasses import dataclass
from io import BytesIO
from typing import Any, Callable, Optional, Union

from PIL import Image


logger = logging.getLogger(__name__)


@dataclass
class SampleRequest:
    """
    Represents a single inference request for benchmarking.
    """

    prompt: Union[str, Any]
    history_QA: Union[str, Any]
    json_data: Optional[dict]
    prompt_len: int
    expected_output_len: int


class BenchmarkDataset(ABC):
    """BenchmarkDataset"""
    DEFAULT_SEED = 0
    IS_MULTIMODAL = False

    def __init__(
        self,
        dataset_path: Optional[str] = None,
        random_seed: int = DEFAULT_SEED,
        hyperparameter_path: Optional[str] = None,
    ) -> None:
        """
        Initialize the BenchmarkDataset with an optional dataset path and random
        seed. Args:
            dataset_path (Optional[str]): Path to the dataset. If None, it
            indicates that a default or random dataset might be used.
            random_seed (int): Seed value for reproducible shuffling or
            sampling. Defaults to DEFAULT_SEED.
        """
        self.dataset_path = dataset_path
        # Set the random seed, ensuring that a None value is replaced with the
        # default seed.
        self.random_seed = (random_seed
                            if random_seed is not None else self.DEFAULT_SEED)
        self.data = None
        self.hyperparameter_path = hyperparameter_path
        self.hyperparameters = {}

    def load_data(self) -> None:
        """
        Load data from the dataset path into self.data.

        This method must be overridden by subclasses since the method to load
        data will vary depending on the dataset format and source.

        Raises:
            NotImplementedError: If a subclass does not implement this method.
        """
        # TODO (jenniferzhao): add support for downloading data
        raise NotImplementedError(
            "load_data must be implemented in subclasses.")

    @abstractmethod
    def sample(self, num_requests: int) -> list[SampleRequest]:
        """
        Abstract method to generate sample requests from the dataset.

        Subclasses must override this method to implement dataset-specific logic
        for generating a list of SampleRequest objects.

        Args:
            num_requests (int): The number of sample requests to generate.

        Returns:
            list[SampleRequest]: A list of sample requests generated from the
            dataset.
        """
        raise NotImplementedError("sample must be implemented in subclasses.")

    def maybe_oversample_requests(self, requests: list[SampleRequest],
                                  num_requests: int) -> None:
        """
        Oversamples the list of requests if its size is less than the desired
        number.

        Args:
            requests (List[SampleRequest]): The current list of sampled
            requests. num_requests (int): The target number of requests.
        """
        if len(requests) < num_requests:
            random.seed(self.random_seed)
            additional = random.choices(requests,
                                        k=num_requests - len(requests))
            requests.extend(additional)
            logger.info("Oversampled requests to reach %d total samples.",
                        num_requests)


def is_valid_sequence(
    prompt_len: int,
    output_len: int,
    min_len: int = 4,
    max_prompt_len: int = 1024,
    max_total_len: int = 2048,
    skip_min_output_len_check: bool = False,
) -> bool:
    """
    Validate a sequence based on prompt and output lengths.

    Default pruning criteria are copied from the original `sample_hf_requests`
    and `sample_sharegpt_requests` functions in benchmark_serving.py, as well as
    from `sample_requests` in benchmark_throughput.py.
    """
    # Check for invalid conditions
    prompt_too_short = prompt_len < min_len
    output_too_short = (not skip_min_output_len_check) and (output_len
                                                            < min_len)
    prompt_too_long = prompt_len > max_prompt_len
    combined_too_long = (prompt_len + output_len) > max_total_len

    # Return True if none of the invalid conditions are met
    return not (prompt_too_short or output_too_short or prompt_too_long
                or combined_too_long)


def process_image(image: Any) -> Mapping[str, Any]:
    """
    Process a single image input and return a multimedia content dictionary.

    Supports three input types:

    1. Dictionary with raw image bytes: - Expects a dict with a 'bytes' key
       containing raw image data. - Loads the bytes as a PIL.Image.Image.

    2. PIL.Image.Image input: - Converts the image to RGB. - Saves the image as
       a JPEG in memory. - Encodes the JPEG data as a base64 string. - Returns
       a dictionary with the image as a base64 data URL.

    3. String input: - Treats the string as a URL or local file path. -
       Prepends "file://" if the string doesn't start with "http://" or
       "file://". - Returns a dictionary with the image URL.

    Raises:
        ValueError: If the input is not a supported type.
    """
    if isinstance(image, dict) and 'bytes' in image:
        image = Image.open(BytesIO(image['bytes']))
    if isinstance(image, Image.Image):
        image = image.convert("RGB")
        with io.BytesIO() as image_data:
            image.save(image_data, format="JPEG")
            image_base64 = base64.b64encode(
                image_data.getvalue()).decode("utf-8")
        return {
            "type": "image_url",
            "image_url": {
                "url": f"data:image/jpeg;base64,{image_base64}"
            },
        }

    if isinstance(image, str):
        image_url = (image if image.startswith(
            ("http://", "file://")) else f"file://{image}")
        return {"type": "image_url", "image_url": {"url": image_url}}

    raise ValueError(f"Invalid image input {image}. Must be a PIL.Image.Image"
                     " or str or dictionary with raw image bytes.")


class EBDataset(BenchmarkDataset):
    """
    Implements the ShareGPT dataset. Loads data from a JSON file and generates
    sample requests based on conversation turns.
    """

    temperature: float
    repetition_penalty: float
    frequency_penalty: float
    presence_penalty: float
    top_p: float
    prompt_len: int

    def __init__(self, **kwargs) -> None:
        super().__init__(**kwargs)
        self.load_data()

    def load_data(self) -> None:
        if self.dataset_path is None:
            raise ValueError("dataset_path must be provided for loading data.")

        with open(self.dataset_path, encoding="utf-8") as f:
            self.data = [json.loads(i.strip()) for i in f.readlines()]

    def sample(
        self,
        num_requests: int,
        lora_path: Optional[str] = None,
        max_loras: Optional[int] = None,
        output_len: Optional[int] = None,
        enable_multimodal_chat: bool = False,
        **kwargs,
    ) -> list:
        samples: list = []
        for entry in self.data:
            if len(samples) >= num_requests:
                break
            prompt = entry["text"]
            self.temperature = float(entry["temperature"])
            self.repetition_penalty = float(entry["penalty_score"])
            self.frequency_penalty = float(entry["frequency_score"])
            self.presence_penalty = float(entry["presence_score"])
            self.top_p = float(entry["topp"])
            self.prompt_len = int(entry["input_token_num"])
            new_output_len = int(entry["max_dec_len"])

            if enable_multimodal_chat:
                prompt = self.apply_multimodal_chat_transformation(
                    prompt, None)
            samples.append(
                SampleRequest(
                    prompt=prompt,
                    prompt_len=self.prompt_len,
                    history_QA=[],
                    json_data=None,
                    expected_output_len=new_output_len,
                ))

        self.maybe_oversample_requests(samples, num_requests)
        return samples


class EBChatDataset(BenchmarkDataset):
    """
    Implements the ShareGPT dataset. Loads data from a JSON file and generates
    sample requests based on conversation turns.
    """
    prompt_len: int

    def __init__(self, **kwargs) -> None:
        super().__init__(**kwargs)
        self.load_data()

    def load_data(self) -> None:
        if self.dataset_path is None:
            raise ValueError("dataset_path must be provided for loading data.")

        with open(self.dataset_path, encoding="utf-8") as f:
            self.data = [json.loads(i.strip()) for i in f.readlines()]

    def sample(
        self,
        num_requests: int,
        lora_path: Optional[str] = None,
        max_loras: Optional[int] = None,
        output_len: Optional[int] = None,
        enable_multimodal_chat: bool = False,
        **kwargs,
    ) -> list:
        samples: list = []
        for entry in self.data:
            if len(samples) >= num_requests:
                break
            json_data = entry
            prompt = entry["messages"][-1].get("content", "")
            history_QA = entry.get("messages", [])
            new_output_len = int(entry.get("max_tokens", 12288))

            if enable_multimodal_chat:
                prompt = self.apply_multimodal_chat_transformation(
                    prompt, None)
            samples.append(
                SampleRequest(
                    json_data=json_data,
                    prompt=prompt,
                    prompt_len=0,
                    history_QA=history_QA,
                    expected_output_len=new_output_len,
                ))

        self.maybe_oversample_requests(samples, num_requests)
        return samples
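Not part of the diff: a small sketch of how `EBChatDataset` is typically used by the benchmark entry point. The dataset file name matches the one from the benchmark README; `num_requests` is arbitrary.

```python
# Sketch: sampling requests from the FD-format chat dataset (assumed usage).
from benchmark_dataset import EBChatDataset

dataset = EBChatDataset(
    dataset_path="./filtered_sharedgpt_2000_input_1136_output_200_fd.json")
requests = dataset.sample(num_requests=10)
for req in requests[:2]:
    # Each SampleRequest carries the chat history and the expected output length.
    print(len(req.history_QA), req.expected_output_len)
```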
benchmarks/benchmark_serving.py  (new file, 1141 lines)
(File diff suppressed because it is too large)
benchmarks/benchmark_utils.py  (new file, 90 lines)
@@ -0,0 +1,90 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""

# This file is modified from https://github.com/vllm-project/vllm/blob/main/benchmarks/benchmark_utils.py


import argparse
import json
import math
import os
from typing import Any


def convert_to_pytorch_benchmark_format(args: argparse.Namespace,
                                        metrics: dict[str, list],
                                        extra_info: dict[str, Any]) -> list:
    """
    Save the benchmark results in the format used by PyTorch OSS benchmark with
    one metric per record
    https://github.com/pytorch/pytorch/wiki/How-to-integrate-with-PyTorch-OSS-benchmark-database
    """
    records = []
    if not os.environ.get("SAVE_TO_PYTORCH_BENCHMARK_FORMAT", False):
        return records

    for name, benchmark_values in metrics.items():
        record = {
            "benchmark": {
                "name": "vLLM benchmark",
                "extra_info": {
                    "args": vars(args),
                },
            },
            "model": {
                "name": args.model,
            },
            "metric": {
                "name": name,
                "benchmark_values": benchmark_values,
                "extra_info": extra_info,
            },
        }

        tp = record["benchmark"]["extra_info"]["args"].get(
            "tensor_parallel_size")
        # Save tensor_parallel_size parameter if it's part of the metadata
        if not tp and "tensor_parallel_size" in extra_info:
            record["benchmark"]["extra_info"]["args"][
                "tensor_parallel_size"] = extra_info["tensor_parallel_size"]

        records.append(record)

    return records


class InfEncoder(json.JSONEncoder):
    """InfEncoder"""

    def clear_inf(self, o: Any):
        """clear_inf"""
        if isinstance(o, dict):
            return {k: self.clear_inf(v) for k, v in o.items()}
        elif isinstance(o, list):
            return [self.clear_inf(v) for v in o]
        elif isinstance(o, float) and math.isinf(o):
            return "inf"
        return o

    def iterencode(self, o: Any, *args, **kwargs) -> Any:
        """iterencode"""
        return super().iterencode(self.clear_inf(o), *args, **kwargs)


def write_to_json(filename: str, records: list) -> None:
    """write_to_json"""
    with open(filename, "w") as f:
        json.dump(records, f, cls=InfEncoder)
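Not part of the diff: a short sketch of how these helpers fit together. `SAVE_TO_PYTORCH_BENCHMARK_FORMAT` must be set for records to be produced; the argument and metric values are placeholders.

```python
# Sketch: emit benchmark records in the PyTorch OSS benchmark format (assumed usage).
import argparse
import os

from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json

os.environ["SAVE_TO_PYTORCH_BENCHMARK_FORMAT"] = "1"
args = argparse.Namespace(model="EB45T", tensor_parallel_size=8)
records = convert_to_pytorch_benchmark_format(
    args,
    metrics={"ttft_ms": [123.4, 110.2]},        # placeholder measurements
    extra_info={"tensor_parallel_size": 8},
)
write_to_json("benchmark_records.json", records)
```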
benchmarks/requirements.txt  (new file, 5 lines)
@@ -0,0 +1,5 @@
aiohttp
tqdm
numpy
Pillow
pyyaml
benchmarks/yaml/eb45-128k-wint4-a800-tp8.yaml  (new file, 8 lines)
@@ -0,0 +1,8 @@
enable_chunked_prefill: True
max_model_len: 131072
max_num_seqs: 16
kv_cache_ratio: 0.75
tensor_parallel_size: 8
max_num_batched_tokens: 4096
max_num_partial_prefills: 3
max_long_partial_prefills: 3
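The YAML files in this directory are deployment presets rather than request hyperparameters. A hedged sketch of how such a preset might be applied, assuming the keys map one-to-one onto `fastdeploy.entrypoints.openai.api_server` flags of the same name (only `--model`, `--port`, and `--tensor-parallel-size` appear in the README above; the rest of the mapping is an assumption):

```python
# Sketch only: turn a benchmark preset YAML into CLI flags (assumed flag names).
import shlex

import yaml

with open("benchmarks/yaml/eb45-128k-wint4-a800-tp8.yaml") as f:
    config = yaml.safe_load(f)

flags = " ".join(f"--{key.replace('_', '-')} {value}"
                 for key, value in config.items())
cmd = ("python -m fastdeploy.entrypoints.openai.api_server "
       "--model ./your-model --port 8188 " + flags)  # placeholder model path
print(shlex.split(cmd))
```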
benchmarks/yaml/eb45-128k-wint4-p800-tp8.yaml  (new file, 5 lines)
@@ -0,0 +1,5 @@
max_model_len: 131072
max_num_seqs: 40
gpu_memory_utilization: 0.9
tensor_parallel_size: 8
quantization: wint4

benchmarks/yaml/eb45-128k-wint8-a800-tp8.yaml  (new file, 8 lines)
@@ -0,0 +1,8 @@
enable_chunked_prefill: True
max_model_len: 131072
max_num_seqs: 16
kv_cache_ratio: 0.75
tensor_parallel_size: 8
max_num_batched_tokens: 4096
max_num_partial_prefills: 3
max_long_partial_prefills: 3

benchmarks/yaml/eb45-21B-vl-128k-wint4-h800-tp1.yaml  (new file, 10 lines)
@@ -0,0 +1,10 @@
enable_mm: True
max_model_len: 32768
max_num_seqs: 128
gpu_memory_utilization: 0.9
kv_cache_ratio: 0.71
tensor_parallel_size: 1
enable_chunked_prefill: True
max_num_batched_tokens: 384
quantization: wint4
reasoning_parser: ernie-45-vl

benchmarks/yaml/eb45-21b-a3b-32k-bf16.yaml  (new file, 5 lines)
@@ -0,0 +1,5 @@
max_model_len: 32768
max_num_seqs: 128
kv_cache_ratio: 0.75
tensor_parallel_size: 1
max_num_batched_tokens: 32768

benchmarks/yaml/eb45-21b-a3b-32k-wint4-a10.yaml  (new file, 5 lines)
@@ -0,0 +1,5 @@
max_model_len: 32768
max_num_seqs: 32
kv_cache_ratio: 0.5
tensor_parallel_size: 1
quantization: wint4

benchmarks/yaml/eb45-21b-a3b-32k-wint4.yaml  (new file, 6 lines)
@@ -0,0 +1,6 @@
max_model_len: 32768
max_num_seqs: 128
kv_cache_ratio: 0.75
tensor_parallel_size: 1
max_num_batched_tokens: 32768
quantization: wint4

benchmarks/yaml/eb45-21b-a3b-32k-wint8.yaml  (new file, 6 lines)
@@ -0,0 +1,6 @@
max_model_len: 32768
max_num_seqs: 128
kv_cache_ratio: 0.75
tensor_parallel_size: 1
max_num_batched_tokens: 32768
quantization: wint8

benchmarks/yaml/eb45-32k-bf16-a30-tp1.yaml  (new file, 5 lines)
@@ -0,0 +1,5 @@
max_model_len: 32768
max_num_seqs: 128
kv_cache_ratio: 0.75
tensor_parallel_size: 1
max_num_batched_tokens: 32768

benchmarks/yaml/eb45-32k-blockwise-fp8-h800-tp8.yaml  (new file, 12 lines)
@@ -0,0 +1,12 @@
max_model_len: 32768
max_num_seqs: 256
tensor_parallel_size: 8
quantization: block_wise_fp8
gpu_memory_utilization: 0.9
kv_cache_ratio: 0.8
enable_chunked_prefill: True
max_num_batched_tokens: 1024
max_num_partial_prefills: 3
max_long_partial_prefills: 3
enable_prefix_caching: True
swap_space: 200

benchmarks/yaml/eb45-32k-tensorwise-fp8-h800-tp8.yaml  (new file, 11 lines)
@@ -0,0 +1,11 @@
max_model_len: 32768
max_num_seqs: 256
tensor_parallel_size: 8
gpu_memory_utilization: 0.9
kv_cache_ratio: 0.8
enable_chunked_prefill: True
max_num_batched_tokens: 1024
max_num_partial_prefills: 3
max_long_partial_prefills: 3
enable_prefix_caching: True
swap_space: 200

benchmarks/yaml/eb45-32k-w4a8c8-a800-tp4.yaml  (new file, 5 lines)
@@ -0,0 +1,5 @@
max_model_len: 32768
max_num_seqs: 96
gpu_memory_utilization: 0.9
kv_cache_ratio: 0.71
tensor_parallel_size: 4

benchmarks/yaml/eb45-32k-w4a8c8-tp4_decode.yaml  (new file, 15 lines)
@@ -0,0 +1,15 @@
max_model_len: 32768
max_num_seqs: 256
gpu_memory_utilization: 0.9
kv_cache_ratio: 0.8
tensor_parallel_size: 4
cache_queue_port: 55663
enable_chunked_prefill: True
splitwise_role: decode
engine_worker_queue_port: 6678
cache_transfer_protocol: "rdma,ipc"
rdma_comm_ports: "7671,7672,7673,7674"
pd_comm_port: "2334"
max_num_batched_tokens: 384
max_num_partial_prefills: 3
max_long_partial_prefills: 3

benchmarks/yaml/eb45-32k-w4a8c8-tp4_prefill.yaml  (new file, 12 lines)
@@ -0,0 +1,12 @@
max_model_len: 32768
max_num_seqs: 16
gpu_memory_utilization: 0.9
kv_cache_ratio: 0.9
tensor_parallel_size: 4
splitwise_role: prefill
enable_prefix_caching: True
cache_queue_port: 55664
engine_worker_queue_port: 6677
cache_transfer_protocol: "rdma,ipc"
rdma_comm_ports: "7675,7676,7677,7678"
pd_comm_port: "2333"

benchmarks/yaml/eb45-32k-wint2-h20-tp1.yaml  (new file, 6 lines)
@@ -0,0 +1,6 @@
max_model_len: 32768
max_num_seqs: 128
kv_cache_ratio: 0.75
tensor_parallel_size: 1
enable_prefix_caching: true
enable_chunked_prefill: true

benchmarks/yaml/eb45-32k-wint4-a800-tp4.yaml  (new file, 5 lines)
@@ -0,0 +1,5 @@
max_model_len: 32768
max_num_seqs: 96
gpu_memory_utilization: 0.9
kv_cache_ratio: 0.71
tensor_parallel_size: 4

benchmarks/yaml/eb45-32k-wint4-h800-dp8_decode.yaml  (new file, 13 lines)
@@ -0,0 +1,13 @@
max_model_len: 32768
max_num_seqs: 256
gpu_memory_utilization: 0.9
kv_cache_ratio: 0.8
tensor_parallel_size: 1
data_parallel_size: 8
num_gpu_blocks_override: 1024
cache_queue_port: 55663
splitwise_role: decode
engine_worker_queue_port: 6678
cache_transfer_protocol: "rdma"
rdma_comm_ports: "7671,7672,7673,7674,7675,7676,7677,7678"
pd_comm_port: "2334"

benchmarks/yaml/eb45-32k-wint4-h800-dp8_prefill.yaml  (new file, 13 lines)
@@ -0,0 +1,13 @@
max_model_len: 32768
max_num_seqs: 16
gpu_memory_utilization: 0.9
kv_cache_ratio: 0.9
tensor_parallel_size: 1
data_parallel_size: 8
splitwise_role: prefill
cache_queue_port: 55664
engine_worker_queue_port: 6677
num_gpu_blocks_override: 1024
|
||||
cache_transfer_protocol: "rdma"
|
||||
rdma_comm_ports: "7671,7672,7673,7674,7675,7676,7677,7678"
|
||||
pd_comm_port: "2334"
|
6
benchmarks/yaml/eb45-32k-wint4-mtp-h800-tp4.yaml
Normal file
6
benchmarks/yaml/eb45-32k-wint4-mtp-h800-tp4.yaml
Normal file
@@ -0,0 +1,6 @@
|
||||
max_model_len: 32768
|
||||
max_num_seqs: 96
|
||||
gpu_memory_utilization: 0.9
|
||||
kv_cache_ratio: 0.71
|
||||
tensor_parallel_size: 4
|
||||
quantization: wint4
|
13
benchmarks/yaml/eb45-32k-wint4-mtp-tp4-decode.yaml
Normal file
13
benchmarks/yaml/eb45-32k-wint4-mtp-tp4-decode.yaml
Normal file
@@ -0,0 +1,13 @@
|
||||
max_model_len: 32768
|
||||
max_num_seqs: 128
|
||||
gpu_memory_utilization: 0.9
|
||||
kv_cache_ratio: 0.7
|
||||
tensor_parallel_size: 4
|
||||
cache_queue_port: 55663
|
||||
enable_chunked_prefill: False
|
||||
enable_prefix_caching: False
|
||||
splitwise_role: decode
|
||||
engine_worker_queue_port: 6678
|
||||
cache_transfer_protocol: "rdma,ipc"
|
||||
rdma_comm_ports: "7671,7672,7673,7674"
|
||||
pd_comm_port: "2334"
|
12
benchmarks/yaml/eb45-32k-wint4-mtp-tp4-prefill.yaml
Normal file
12
benchmarks/yaml/eb45-32k-wint4-mtp-tp4-prefill.yaml
Normal file
@@ -0,0 +1,12 @@
|
||||
max_model_len: 32768
|
||||
max_num_seqs: 16
|
||||
gpu_memory_utilization: 0.9
|
||||
kv_cache_ratio: 0.9
|
||||
tensor_parallel_size: 4
|
||||
splitwise_role: prefill
|
||||
enable_prefix_caching: False
|
||||
cache_queue_port: 55664
|
||||
engine_worker_queue_port: 6677
|
||||
cache_transfer_protocol: "rdma,ipc"
|
||||
rdma_comm_ports: "7675,7676,7677,7678"
|
||||
pd_comm_port: "2333"
|
5
benchmarks/yaml/eb45-32k-wint4-p800-tp4.yaml
Normal file
5
benchmarks/yaml/eb45-32k-wint4-p800-tp4.yaml
Normal file
@@ -0,0 +1,5 @@
|
||||
max_model_len: 32768
|
||||
max_num_seqs: 40
|
||||
tensor_parallel_size: 4
|
||||
quantization: wint4
|
||||
gpu_memory_utilization: 0.9
|
5
benchmarks/yaml/eb45-32k-wint4-p800-tp8.yaml
Normal file
5
benchmarks/yaml/eb45-32k-wint4-p800-tp8.yaml
Normal file
@@ -0,0 +1,5 @@
|
||||
max_model_len: 32768
|
||||
max_num_seqs: 160
|
||||
tensor_parallel_size: 8
|
||||
quantization: wint4
|
||||
gpu_memory_utilization: 0.9
|
8
benchmarks/yaml/eb45-32k-wint4-prefixcache-a800-tp4.yaml
Normal file
8
benchmarks/yaml/eb45-32k-wint4-prefixcache-a800-tp4.yaml
Normal file
@@ -0,0 +1,8 @@
|
||||
enable_prefix_caching: True
|
||||
max_model_len: 32768
|
||||
max_num_seqs: 128
|
||||
gpu_memory_utilization: 0.9
|
||||
kv_cache_ratio: 0.71
|
||||
tensor_parallel_size: 4
|
||||
swap_space: 200
|
||||
cache_queue_port: 55664
|
15
benchmarks/yaml/eb45-32k-wint4-tp4_decode.yaml
Normal file
15
benchmarks/yaml/eb45-32k-wint4-tp4_decode.yaml
Normal file
@@ -0,0 +1,15 @@
|
||||
max_model_len: 32768
|
||||
max_num_seqs: 256
|
||||
gpu_memory_utilization: 0.9
|
||||
kv_cache_ratio: 0.8
|
||||
tensor_parallel_size: 4
|
||||
cache_queue_port: 55663
|
||||
enable_chunked_prefill: True
|
||||
splitwise_role: decode
|
||||
engine_worker_queue_port: 6678
|
||||
cache_transfer_protocol: "rdma,ipc"
|
||||
rdma_comm_ports: "7671,7672,7673,7674"
|
||||
pd_comm_port: "2334"
|
||||
max_num_batched_tokens: 384
|
||||
max_num_partial_prefills: 3
|
||||
max_long_partial_prefills: 3
|
12
benchmarks/yaml/eb45-32k-wint4-tp4_prefill.yaml
Normal file
12
benchmarks/yaml/eb45-32k-wint4-tp4_prefill.yaml
Normal file
@@ -0,0 +1,12 @@
|
||||
max_model_len: 32768
|
||||
max_num_seqs: 16
|
||||
gpu_memory_utilization: 0.9
|
||||
kv_cache_ratio: 0.9
|
||||
tensor_parallel_size: 4
|
||||
splitwise_role: prefill
|
||||
enable_prefix_caching: True
|
||||
cache_queue_port: 55664
|
||||
engine_worker_queue_port: 6677
|
||||
cache_transfer_protocol: "rdma,ipc"
|
||||
rdma_comm_ports: "7675,7676,7677,7678"
|
||||
pd_comm_port: "2333"
|
5
benchmarks/yaml/eb45-32k-wint8-a800-tp8.yaml
Normal file
5
benchmarks/yaml/eb45-32k-wint8-a800-tp8.yaml
Normal file
@@ -0,0 +1,5 @@
|
||||
max_model_len: 32768
|
||||
max_num_seqs: 96
|
||||
gpu_memory_utilization: 0.9
|
||||
kv_cache_ratio: 0.71
|
||||
tensor_parallel_size: 8
|
5
benchmarks/yaml/eb45-32k-wint8-p800-tp8.yaml
Normal file
5
benchmarks/yaml/eb45-32k-wint8-p800-tp8.yaml
Normal file
@@ -0,0 +1,5 @@
|
||||
max_model_len: 32768
|
||||
max_num_seqs: 80
|
||||
tensor_parallel_size: 8
|
||||
quantization: wint8
|
||||
gpu_memory_utilization: 0.9
|
9
benchmarks/yaml/eb45-32k-wint8-prefixcache-a800-tp8.yaml
Normal file
9
benchmarks/yaml/eb45-32k-wint8-prefixcache-a800-tp8.yaml
Normal file
@@ -0,0 +1,9 @@
|
||||
enable_prefix_caching: True
|
||||
max_model_len: 32768
|
||||
max_num_batched_tokens: 68304
|
||||
max_num_seqs: 128
|
||||
gpu_memory_utilization: 0.9
|
||||
kv_cache_ratio: 0.71
|
||||
tensor_parallel_size: 8
|
||||
swap_space: 100
|
||||
cache_queue_port: 55664
|
9
benchmarks/yaml/eb45-vl-32k-wint4-a800-tp8.yaml
Normal file
9
benchmarks/yaml/eb45-vl-32k-wint4-a800-tp8.yaml
Normal file
@@ -0,0 +1,9 @@
|
||||
enable_mm: True
|
||||
max_model_len: 32768
|
||||
max_num_seqs: 56
|
||||
gpu_memory_utilization: 0.9
|
||||
kv_cache_ratio: 0.8
|
||||
tensor_parallel_size: 8
|
||||
quantization: wint4
|
||||
limit_mm_per_prompt: '{"image": 100, "video": 100}'
|
||||
reasoning_parser: ernie-45-vl
|
11
benchmarks/yaml/eb45-vl-32k-wint4-h800-tp8.yaml
Normal file
11
benchmarks/yaml/eb45-vl-32k-wint4-h800-tp8.yaml
Normal file
@@ -0,0 +1,11 @@
|
||||
enable_mm: True
|
||||
max_model_len: 32768
|
||||
max_num_seqs: 56
|
||||
gpu_memory_utilization: 0.8
|
||||
kv_cache_ratio: 0.8
|
||||
tensor_parallel_size: 8
|
||||
quantization: wint4
|
||||
limit_mm_per_prompt: '{"image": 100, "video": 100}'
|
||||
enable_chunked_prefill: True
|
||||
max_num_batched_tokens: 384
|
||||
reasoning_parser: ernie-45-vl
|
9
benchmarks/yaml/eb45-vl-32k-wint4-tp4.yaml
Normal file
9
benchmarks/yaml/eb45-vl-32k-wint4-tp4.yaml
Normal file
@@ -0,0 +1,9 @@
|
||||
enable_mm: True
|
||||
max_model_len: 32768
|
||||
max_num_seqs: 36
|
||||
gpu_memory_utilization: 0.9
|
||||
kv_cache_ratio: 0.8
|
||||
tensor_parallel_size: 4
|
||||
quantization: wint4
|
||||
limit_mm_per_prompt: '{"image": 100, "video": 100}'
|
||||
reasoning_parser: ernie-45-vl
|
9
benchmarks/yaml/eb45-vl-32k-wint8-a800-tp8.yaml
Normal file
9
benchmarks/yaml/eb45-vl-32k-wint8-a800-tp8.yaml
Normal file
@@ -0,0 +1,9 @@
|
||||
enable_mm: True
|
||||
max_model_len: 32768
|
||||
max_num_seqs: 36
|
||||
gpu_memory_utilization: 0.95
|
||||
kv_cache_ratio: 0.8
|
||||
tensor_parallel_size: 8
|
||||
quantization: wint8
|
||||
limit_mm_per_prompt: '{"image": 100, "video": 100}'
|
||||
reasoning_parser: ernie-45-vl
|
11
benchmarks/yaml/eb45-vl-32k-wint8-h800-tp8.yaml
Normal file
11
benchmarks/yaml/eb45-vl-32k-wint8-h800-tp8.yaml
Normal file
@@ -0,0 +1,11 @@
|
||||
enable_mm: True
|
||||
max_model_len: 32768
|
||||
max_num_seqs: 36
|
||||
gpu_memory_utilization: 0.8
|
||||
kv_cache_ratio: 0.8
|
||||
tensor_parallel_size: 8
|
||||
quantization: wint8
|
||||
limit_mm_per_prompt: '{"image": 100, "video": 100}'
|
||||
enable_chunked_prefill: True
|
||||
max_num_batched_tokens: 384
|
||||
reasoning_parser: ernie-45-vl
|
9
benchmarks/yaml/eb45-vl-32k-wint8-tp4.yaml
Normal file
9
benchmarks/yaml/eb45-vl-32k-wint8-tp4.yaml
Normal file
@@ -0,0 +1,9 @@
|
||||
enable_mm: True
|
||||
max_model_len: 32768
|
||||
max_num_seqs: 36
|
||||
gpu_memory_utilization: 0.9
|
||||
kv_cache_ratio: 0.8
|
||||
tensor_parallel_size: 4
|
||||
quantization: wint8
|
||||
limit_mm_per_prompt: '{"image": 100, "video": 100}'
|
||||
reasoning_parser: ernie-45-vl
|
@@ -0,0 +1,5 @@
|
||||
max_model_len: 32768
|
||||
max_num_seqs: 128
|
||||
kv_cache_ratio: 0.75
|
||||
tensor_parallel_size: 1
|
||||
enable_static_graph_inference: True
|
@@ -0,0 +1,5 @@
|
||||
max_model_len: 32768
|
||||
max_num_seqs: 128
|
||||
kv_cache_ratio: 0.75
|
||||
tensor_parallel_size: 1
|
||||
enable_static_graph_inference: True
|
@@ -0,0 +1,6 @@
|
||||
max_model_len: 32768
|
||||
max_num_seqs: 128
|
||||
kv_cache_ratio: 0.75
|
||||
tensor_parallel_size: 1
|
||||
quantization: wint8
|
||||
enable_static_graph_inference: True
|
@@ -0,0 +1,6 @@
|
||||
max_model_len: 32768
|
||||
max_num_seqs: 128
|
||||
kv_cache_ratio: 0.75
|
||||
tensor_parallel_size: 1
|
||||
quantization: wint8
|
||||
enable_static_graph_inference: True
|
5
benchmarks/yaml/eb45t_21b-32k-bf16-h800-tp1-static.yaml
Normal file
5
benchmarks/yaml/eb45t_21b-32k-bf16-h800-tp1-static.yaml
Normal file
@@ -0,0 +1,5 @@
|
||||
max_model_len: 32768
|
||||
max_num_seqs: 128
|
||||
kv_cache_ratio: 0.75
|
||||
tensor_parallel_size: 1
|
||||
enable_static_graph_inference: True
|
6
benchmarks/yaml/eb45t_21b-32k-wint4-h800-tp1-static.yaml
Normal file
6
benchmarks/yaml/eb45t_21b-32k-wint4-h800-tp1-static.yaml
Normal file
@@ -0,0 +1,6 @@
|
||||
max_model_len: 32768
|
||||
max_num_seqs: 128
|
||||
kv_cache_ratio: 0.75
|
||||
tensor_parallel_size: 1
|
||||
quantization: wint4
|
||||
enable_static_graph_inference: True
|
@@ -0,0 +1,6 @@
|
||||
max_model_len: 32768
|
||||
max_num_seqs: 96
|
||||
gpu_memory_utilization: 0.9
|
||||
kv_cache_ratio: 0.71
|
||||
tensor_parallel_size: 4
|
||||
enable_static_graph_inference: True
|
5
benchmarks/yaml/qwen2_7b-32k-bf16-a30-tp1-static.yaml
Normal file
5
benchmarks/yaml/qwen2_7b-32k-bf16-a30-tp1-static.yaml
Normal file
@@ -0,0 +1,5 @@
|
||||
max_model_len: 32768
|
||||
max_num_seqs: 128
|
||||
kv_cache_ratio: 0.75
|
||||
tensor_parallel_size: 1
|
||||
enable_static_graph_inference: True
|
5
benchmarks/yaml/qwen2_7b-32k-bf16-h800-tp1-static.yaml
Normal file
5
benchmarks/yaml/qwen2_7b-32k-bf16-h800-tp1-static.yaml
Normal file
@@ -0,0 +1,5 @@
|
||||
max_model_len: 32768
|
||||
max_num_seqs: 128
|
||||
kv_cache_ratio: 0.75
|
||||
tensor_parallel_size: 1
|
||||
enable_static_graph_inference: True
|
4
benchmarks/yaml/qwen2_7b-32k-bf16-h800-tp1.yaml
Normal file
4
benchmarks/yaml/qwen2_7b-32k-bf16-h800-tp1.yaml
Normal file
@@ -0,0 +1,4 @@
|
||||
max_model_len: 32768
|
||||
max_num_seqs: 128
|
||||
kv_cache_ratio: 0.75
|
||||
tensor_parallel_size: 1
|
6
benchmarks/yaml/qwen2_7b-32k-fp8-h800-tp1-static.yaml
Normal file
6
benchmarks/yaml/qwen2_7b-32k-fp8-h800-tp1-static.yaml
Normal file
@@ -0,0 +1,6 @@
|
||||
max_model_len: 32768
|
||||
max_num_seqs: 128
|
||||
kv_cache_ratio: 0.75
|
||||
tensor_parallel_size: 1
|
||||
quantization: wfp8afp8
|
||||
enable_static_graph_inference: True
|
5
benchmarks/yaml/qwen2_7b-32k-fp8-h800-tp1.yaml
Normal file
5
benchmarks/yaml/qwen2_7b-32k-fp8-h800-tp1.yaml
Normal file
@@ -0,0 +1,5 @@
|
||||
max_model_len: 32768
|
||||
max_num_seqs: 128
|
||||
kv_cache_ratio: 0.75
|
||||
tensor_parallel_size: 1
|
||||
quantization: wfp8afp8
|
5
benchmarks/yaml/qwen2_7b-32k-wint8-h800-tp1.yaml
Normal file
5
benchmarks/yaml/qwen2_7b-32k-wint8-h800-tp1.yaml
Normal file
@@ -0,0 +1,5 @@
|
||||
max_model_len: 32768
|
||||
max_num_seqs: 128
|
||||
kv_cache_ratio: 0.75
|
||||
tensor_parallel_size: 1
|
||||
quantization: wint8
|
@@ -0,0 +1,5 @@
|
||||
max_model_len: 32768
|
||||
max_num_seqs: 128
|
||||
kv_cache_ratio: 0.75
|
||||
tensor_parallel_size: 1
|
||||
enable_static_graph_inference: True
|
@@ -0,0 +1,5 @@
|
||||
max_model_len: 32768
|
||||
max_num_seqs: 128
|
||||
kv_cache_ratio: 0.75
|
||||
tensor_parallel_size: 1
|
||||
enable_static_graph_inference: True
|
@@ -0,0 +1,6 @@
|
||||
max_model_len: 32768
|
||||
max_num_seqs: 128
|
||||
kv_cache_ratio: 0.75
|
||||
tensor_parallel_size: 1
|
||||
quantization: wint8
|
||||
enable_static_graph_inference: True
|
@@ -0,0 +1,6 @@
|
||||
max_model_len: 32768
|
||||
max_num_seqs: 128
|
||||
kv_cache_ratio: 0.75
|
||||
tensor_parallel_size: 1
|
||||
quantization: wint8
|
||||
enable_static_graph_inference: True
|
5
benchmarks/yaml/qwen3_30b-32k-bf16-h800-tp1-static.yaml
Normal file
5
benchmarks/yaml/qwen3_30b-32k-bf16-h800-tp1-static.yaml
Normal file
@@ -0,0 +1,5 @@
|
||||
max_model_len: 32768
|
||||
max_num_seqs: 128
|
||||
kv_cache_ratio: 0.75
|
||||
tensor_parallel_size: 1
|
||||
enable_static_graph_inference: True
|
6
benchmarks/yaml/qwen3_30b-32k-wint4-h800-tp1-static.yaml
Normal file
6
benchmarks/yaml/qwen3_30b-32k-wint4-h800-tp1-static.yaml
Normal file
@@ -0,0 +1,6 @@
|
||||
max_model_len: 32768
|
||||
max_num_seqs: 128
|
||||
kv_cache_ratio: 0.75
|
||||
tensor_parallel_size: 1
|
||||
quantization: wint4
|
||||
enable_static_graph_inference: True
|
5
benchmarks/yaml/qwen3dot6b-32k-bf16-a30-tp1.yaml
Normal file
5
benchmarks/yaml/qwen3dot6b-32k-bf16-a30-tp1.yaml
Normal file
@@ -0,0 +1,5 @@
|
||||
max_model_len: 32768
|
||||
max_num_seqs: 256
|
||||
gpu_memory_utilization: 0.9
|
||||
kv_cache_ratio: 0.75
|
||||
tensor_parallel_size: 1
|
5
benchmarks/yaml/qwen3dot6b-32k-bf16-a800-tp1.yaml
Normal file
5
benchmarks/yaml/qwen3dot6b-32k-bf16-a800-tp1.yaml
Normal file
@@ -0,0 +1,5 @@
|
||||
max_model_len: 32768
|
||||
max_num_seqs: 256
|
||||
gpu_memory_utilization: 0.9
|
||||
kv_cache_ratio: 0.75
|
||||
tensor_parallel_size: 1
|
5
benchmarks/yaml/qwen3dot6b-32k-bf16-h800-tp1.yaml
Normal file
5
benchmarks/yaml/qwen3dot6b-32k-bf16-h800-tp1.yaml
Normal file
@@ -0,0 +1,5 @@
|
||||
max_model_len: 32768
|
||||
max_num_seqs: 256
|
||||
gpu_memory_utilization: 0.9
|
||||
kv_cache_ratio: 0.75
|
||||
tensor_parallel_size: 1
|
6
benchmarks/yaml/qwen3dot6b-32k-wint8-a30-tp1.yaml
Normal file
6
benchmarks/yaml/qwen3dot6b-32k-wint8-a30-tp1.yaml
Normal file
@@ -0,0 +1,6 @@
|
||||
max_model_len: 32768
|
||||
max_num_seqs: 256
|
||||
gpu_memory_utilization: 0.9
|
||||
kv_cache_ratio: 0.75
|
||||
quantization: wint8
|
||||
tensor_parallel_size: 1
|
6
benchmarks/yaml/qwen3dot6b-32k-wint8-a800-tp1.yaml
Normal file
6
benchmarks/yaml/qwen3dot6b-32k-wint8-a800-tp1.yaml
Normal file
@@ -0,0 +1,6 @@
|
||||
max_model_len: 32768
|
||||
max_num_seqs: 256
|
||||
gpu_memory_utilization: 0.9
|
||||
kv_cache_ratio: 0.75
|
||||
quantization: wint8
|
||||
tensor_parallel_size: 1
|
6
benchmarks/yaml/qwen3dot6b-32k-wint8-h800-tp1.yaml
Normal file
6
benchmarks/yaml/qwen3dot6b-32k-wint8-h800-tp1.yaml
Normal file
@@ -0,0 +1,6 @@
|
||||
max_model_len: 32768
|
||||
max_num_seqs: 256
|
||||
gpu_memory_utilization: 0.9
|
||||
kv_cache_ratio: 0.75
|
||||
quantization: wint8
|
||||
tensor_parallel_size: 1
|
6
benchmarks/yaml/qwen3moe235b-32k-wint4-h800-tp4.yaml
Normal file
6
benchmarks/yaml/qwen3moe235b-32k-wint4-h800-tp4.yaml
Normal file
@@ -0,0 +1,6 @@
|
||||
max_model_len: 32768
|
||||
max_num_seqs: 75
|
||||
gpu_memory_utilization: 0.85
|
||||
kv_cache_ratio: 0.75
|
||||
quantization: wint4
|
||||
tensor_parallel_size: 4
|
6
benchmarks/yaml/qwen3moe235b-32k-wint8-h800-tp4.yaml
Normal file
6
benchmarks/yaml/qwen3moe235b-32k-wint8-h800-tp4.yaml
Normal file
@@ -0,0 +1,6 @@
|
||||
max_model_len: 32768
|
||||
max_num_seqs: 25
|
||||
gpu_memory_utilization: 0.9
|
||||
kv_cache_ratio: 0.75
|
||||
quantization: wint8
|
||||
tensor_parallel_size: 4
|
5
benchmarks/yaml/qwen3moe30b-32k-bf16-a800-tp1.yaml
Normal file
5
benchmarks/yaml/qwen3moe30b-32k-bf16-a800-tp1.yaml
Normal file
@@ -0,0 +1,5 @@
|
||||
max_model_len: 32768
|
||||
max_num_seqs: 50
|
||||
gpu_memory_utilization: 0.85
|
||||
kv_cache_ratio: 0.75
|
||||
tensor_parallel_size: 1
|
5
benchmarks/yaml/qwen3moe30b-32k-bf16-h800-tp1.yaml
Normal file
5
benchmarks/yaml/qwen3moe30b-32k-bf16-h800-tp1.yaml
Normal file
@@ -0,0 +1,5 @@
|
||||
max_model_len: 32768
|
||||
max_num_seqs: 50
|
||||
gpu_memory_utilization: 0.85
|
||||
kv_cache_ratio: 0.75
|
||||
tensor_parallel_size: 1
|
6
benchmarks/yaml/qwen3moe30b-32k-wint4-a800-tp1.yaml
Normal file
6
benchmarks/yaml/qwen3moe30b-32k-wint4-a800-tp1.yaml
Normal file
@@ -0,0 +1,6 @@
|
||||
max_model_len: 32768
|
||||
max_num_seqs: 50
|
||||
gpu_memory_utilization: 0.8
|
||||
kv_cache_ratio: 0.75
|
||||
quantization: wint4
|
||||
tensor_parallel_size: 1
|
6
benchmarks/yaml/qwen3moe30b-32k-wint4-h800-tp1.yaml
Normal file
6
benchmarks/yaml/qwen3moe30b-32k-wint4-h800-tp1.yaml
Normal file
@@ -0,0 +1,6 @@
|
||||
max_model_len: 32768
|
||||
max_num_seqs: 50
|
||||
gpu_memory_utilization: 0.8
|
||||
kv_cache_ratio: 0.75
|
||||
quantization: wint4
|
||||
tensor_parallel_size: 1
|
8
benchmarks/yaml/request_yaml/eb45-128k.yaml
Normal file
8
benchmarks/yaml/request_yaml/eb45-128k.yaml
Normal file
@@ -0,0 +1,8 @@
|
||||
top_p: 0.8
|
||||
temperature: 0.8
|
||||
metadata:
|
||||
min_tokens: 1
|
||||
max_tokens: 131071
|
||||
repetition_penalty: 1.0
|
||||
frequency_penalty: 0
|
||||
presence_penalty: 0
|
8
benchmarks/yaml/request_yaml/eb45-32k.yaml
Normal file
8
benchmarks/yaml/request_yaml/eb45-32k.yaml
Normal file
@@ -0,0 +1,8 @@
|
||||
top_p: 0.8
|
||||
temperature: 0.8
|
||||
metadata:
|
||||
min_tokens: 1
|
||||
max_tokens: 12288
|
||||
repetition_penalty: 1.0
|
||||
frequency_penalty: 0
|
||||
presence_penalty: 0
|
8
benchmarks/yaml/request_yaml/qwen2-32k.yaml
Normal file
8
benchmarks/yaml/request_yaml/qwen2-32k.yaml
Normal file
@@ -0,0 +1,8 @@
|
||||
top_p: 0.8
|
||||
temperature: 0.7
|
||||
metadata:
|
||||
min_tokens: 1
|
||||
max_tokens: 12288
|
||||
repetition_penalty: 1.05
|
||||
frequency_penalty: 0
|
||||
presence_penalty: 0
|
8
benchmarks/yaml/request_yaml/qwen3-32k.yaml
Normal file
8
benchmarks/yaml/request_yaml/qwen3-32k.yaml
Normal file
@@ -0,0 +1,8 @@
|
||||
top_p: 0.8
|
||||
temperature: 0.7
|
||||
metadata:
|
||||
min_tokens: 1
|
||||
max_tokens: 12288
|
||||
repetition_penalty: 1.0
|
||||
frequency_penalty: 0
|
||||
presence_penalty: 1.5
|
8
benchmarks/yaml/request_yaml/x1-32k.yaml
Normal file
8
benchmarks/yaml/request_yaml/x1-32k.yaml
Normal file
@@ -0,0 +1,8 @@
|
||||
top_p: 0.95
|
||||
temperature: 0.6
|
||||
metadata:
|
||||
min_tokens: 1
|
||||
max_tokens: 32767
|
||||
repetition_penalty: 1.0
|
||||
frequency_penalty: 0
|
||||
presence_penalty: 0
|
6
benchmarks/yaml/x1-32k-wint4-h800-tp8.yaml
Normal file
6
benchmarks/yaml/x1-32k-wint4-h800-tp8.yaml
Normal file
@@ -0,0 +1,6 @@
|
||||
tensor_parallel_size: 8
|
||||
max_model_len: 32768
|
||||
max_num_seqs: 32
|
||||
num_gpu_blocks_override: 4096
|
||||
kv_cache_ratio: 0.5
|
||||
reasoning_parser: ernie-x1
|
6
benchmarks/yaml/x1-32k-wint4-p800-tp4.yaml
Normal file
6
benchmarks/yaml/x1-32k-wint4-p800-tp4.yaml
Normal file
@@ -0,0 +1,6 @@
|
||||
max_model_len: 32768
|
||||
max_num_seqs: 32
|
||||
gpu_memory_utilization: 0.9
|
||||
tensor_parallel_size: 4
|
||||
quantization: wint4
|
||||
reasoning_parser: ernie-x1
|
6
benchmarks/yaml/x1-32k-wint4-p800-tp8.yaml
Normal file
6
benchmarks/yaml/x1-32k-wint4-p800-tp8.yaml
Normal file
@@ -0,0 +1,6 @@
|
||||
max_model_len: 32768
|
||||
max_num_seqs: 128
|
||||
gpu_memory_utilization: 0.9
|
||||
tensor_parallel_size: 8
|
||||
quantization: wint4
|
||||
reasoning_parser: ernie-x1
|
10
benchmarks/yaml/x1-32k-wint4-prefixcache-h800-tp8.yaml
Normal file
10
benchmarks/yaml/x1-32k-wint4-prefixcache-h800-tp8.yaml
Normal file
@@ -0,0 +1,10 @@
|
||||
enable_prefix_caching: True
|
||||
num_gpu_blocks_override: 8000
|
||||
max_model_len: 32768
|
||||
max_num_seqs: 64
|
||||
gpu_memory_utilization: 0.85
|
||||
kv_cache_ratio: 0.5
|
||||
tensor_parallel_size: 8
|
||||
swap_space: 200
|
||||
cache_queue_port: 55664
|
||||
reasoning_parser: ernie-x1
|
6
benchmarks/yaml/x1-32k-wint8-h800-tp8.yaml
Normal file
6
benchmarks/yaml/x1-32k-wint8-h800-tp8.yaml
Normal file
@@ -0,0 +1,6 @@
|
||||
tensor_parallel_size: 8
|
||||
max_model_len: 32768
|
||||
max_num_seqs: 32
|
||||
num_gpu_blocks_override: 4096
|
||||
kv_cache_ratio: 0.5
|
||||
reasoning_parser: ernie-x1
|
6
benchmarks/yaml/x1-32k-wint8-p800-tp4.yaml
Normal file
6
benchmarks/yaml/x1-32k-wint8-p800-tp4.yaml
Normal file
@@ -0,0 +1,6 @@
|
||||
max_model_len: 32768
|
||||
max_num_seqs: 8
|
||||
gpu_memory_utilization: 0.9
|
||||
tensor_parallel_size: 4
|
||||
quantization: wint8
|
||||
reasoning_parser: ernie-x1
|
6
benchmarks/yaml/x1-32k-wint8-p800-tp8.yaml
Normal file
6
benchmarks/yaml/x1-32k-wint8-p800-tp8.yaml
Normal file
@@ -0,0 +1,6 @@
|
||||
max_model_len: 32768
|
||||
max_num_seqs: 64
|
||||
gpu_memory_utilization: 0.9
|
||||
tensor_parallel_size: 8
|
||||
quantization: wint8
|
||||
reasoning_parser: ernie-x1
|
10
benchmarks/yaml/x1-32k-wint8-prefixcache-h800-tp8.yaml
Normal file
10
benchmarks/yaml/x1-32k-wint8-prefixcache-h800-tp8.yaml
Normal file
@@ -0,0 +1,10 @@
|
||||
enable_prefix_caching: True
|
||||
num_gpu_blocks_override: 8000
|
||||
max_model_len: 32768
|
||||
max_num_seqs: 64
|
||||
gpu_memory_utilization: 0.85
|
||||
kv_cache_ratio: 0.5
|
||||
tensor_parallel_size: 8
|
||||
swap_space: 200
|
||||
cache_queue_port: 55664
|
||||
reasoning_parser: ernie-x1
|
66
build.sh
66
build.sh
@@ -17,8 +17,9 @@
|
||||
BUILD_WHEEL=${1:-1}
|
||||
PYTHON_VERSION=${2:-"python"}
|
||||
export python=$PYTHON_VERSION
|
||||
CPU_USE_BF16=${3:-"false"}
|
||||
BUILDING_ARCS=${4:-""}
|
||||
FD_CPU_USE_BF16=${3:-"false"}
|
||||
FD_BUILDING_ARCS=${4:-""}
|
||||
|
||||
|
||||
# paddle distributed use to set archs
|
||||
unset PADDLE_CUDA_ARCH_LIST
|
||||
@@ -30,13 +31,9 @@ EGG_DIR="fastdeploy.egg-info"
|
||||
|
||||
# custom_ops directory config
|
||||
OPS_SRC_DIR="custom_ops"
|
||||
OPS_BUILD_DIR="build"
|
||||
OPS_EGG_DIR="efficitentllm_ops.egg-info"
|
||||
OPS_TMP_DIR_BASE="tmp_base"
|
||||
OPS_TMP_DIR="tmp"
|
||||
|
||||
TEST_DIR="tests"
|
||||
|
||||
# command line log config
|
||||
RED='\033[0;31m'
|
||||
BLUE='\033[0;34m'
|
||||
@@ -44,13 +41,14 @@ GREEN='\033[1;32m'
|
||||
BOLD='\033[1m'
|
||||
NONE='\033[0m'
|
||||
|
||||
DEVICE_TYPE="gpu"
|
||||
|
||||
function python_version_check() {
|
||||
PY_MAIN_VERSION=`${python} -V 2>&1 | awk '{print $2}' | awk -F '.' '{print $1}'`
|
||||
PY_SUB_VERSION=`${python} -V 2>&1 | awk '{print $2}' | awk -F '.' '{print $2}'`
|
||||
echo -e "find python version ${PY_MAIN_VERSION}.${PY_SUB_VERSION}"
|
||||
if [ $PY_MAIN_VERSION -ne "3" -o $PY_SUB_VERSION -lt "8" ]; then
|
||||
echo -e "${RED}FAIL:${NONE} please use Python >= 3.8"
|
||||
if [ $PY_MAIN_VERSION -ne "3" -o $PY_SUB_VERSION -lt "9" ]; then
|
||||
echo -e "${RED}FAIL:${NONE} please use Python >= 3.9"
|
||||
exit 1
|
||||
fi
|
||||
}
|
||||
@@ -75,6 +73,7 @@ function copy_ops(){
|
||||
WHEEL_CPU_NAME="fastdeploy_cpu_ops-${OPS_VERSION}-${PY_VERSION}-${SYSTEM_VERSION}-${PROCESSOR_VERSION}.egg"
|
||||
is_rocm=`$python -c "import paddle; print(paddle.is_compiled_with_rocm())"`
|
||||
if [ "$is_rocm" = "True" ]; then
|
||||
DEVICE_TYPE="rocm"
|
||||
cp -r ./${OPS_TMP_DIR}/${WHEEL_NAME}/* ../fastdeploy/model_executor/ops/gpu
|
||||
echo -e "ROCM ops have been copy to fastdeploy"
|
||||
return
|
||||
@@ -82,6 +81,7 @@ function copy_ops(){
|
||||
mkdir -p ../fastdeploy/model_executor/ops/base
|
||||
is_cuda=`$python -c "import paddle; print(paddle.is_compiled_with_cuda())"`
|
||||
if [ "$is_cuda" = "True" ]; then
|
||||
DEVICE_TYPE="gpu"
|
||||
cp -r ./${OPS_TMP_DIR_BASE}/${WHEEL_BASE_NAME}/* ../fastdeploy/model_executor/ops/base
|
||||
cp -r ./${OPS_TMP_DIR}/${WHEEL_NAME}/* ../fastdeploy/model_executor/ops/gpu
|
||||
echo -e "BASE and CUDA ops have been copy to fastdeploy"
|
||||
@@ -90,6 +90,7 @@ function copy_ops(){
|
||||
|
||||
is_xpu=`$python -c "import paddle; print(paddle.is_compiled_with_xpu())"`
|
||||
if [ "$is_xpu" = "True" ]; then
|
||||
DEVICE_TYPE="xpu"
|
||||
cp -r ./${OPS_TMP_DIR}/${WHEEL_NAME}/* ../fastdeploy/model_executor/ops/xpu
|
||||
echo -e "xpu ops have been copy to fastdeploy"
|
||||
return
|
||||
@@ -97,20 +98,14 @@ function copy_ops(){
|
||||
|
||||
is_npu=`$python -c "import paddle; print(paddle.is_compiled_with_custom_device('npu'))"`
|
||||
if [ "$is_npu" = "True" ]; then
|
||||
DEVICE_TYPE="npu"
|
||||
cp -r ${OPS_TMP_DIR}/${WHEEL_NAME}/* ../fastdeploy/model_executor/ops/npu
|
||||
echo -e "npu ops have been copy to fastdeploy"
|
||||
return
|
||||
fi
|
||||
|
||||
DEVICE_TYPE="cpu"
|
||||
cp -r ./${OPS_TMP_DIR_BASE}/${WHEEL_BASE_NAME}/* ../fastdeploy/model_executor/ops/base
|
||||
cd ${OPS_TMP_DIR}/${WHEEL_CPU_NAME}/xFasterTransformer/build/
|
||||
for file in *_pd_.so; do
|
||||
mv "$file" "${file/_pd_/}"
|
||||
done
|
||||
cd ../../x86-simd-sort/builddir/
|
||||
for file in *_pd_.so; do
|
||||
mv "$file" "${file/_pd_/}"
|
||||
done
|
||||
cd ../../../../
|
||||
cp -r ${OPS_TMP_DIR}/${WHEEL_CPU_NAME}/* ../fastdeploy/model_executor/ops/cpu
|
||||
echo -e "BASE and CPU ops have been copy to fastdeploy"
|
||||
@@ -122,15 +117,30 @@ function build_and_install_ops() {
|
||||
export no_proxy=bcebos.com,paddlepaddle.org.cn,${no_proxy}
|
||||
echo -e "${BLUE}[build]${NONE} build and install fastdeploy_base_ops..."
|
||||
${python} setup_ops_base.py install --install-lib ${OPS_TMP_DIR_BASE}
|
||||
find ${OPS_TMP_DIR_BASE} -type f -name "*.o" -exec rm -f {} \;
|
||||
echo -e "${BLUE}[build]${NONE} build and install fastdeploy_ops..."
|
||||
if [ "$CPU_USE_BF16" == "true" ]; then
|
||||
CPU_USE_BF16=True ${python} setup_ops.py install --install-lib ${OPS_TMP_DIR}
|
||||
:
|
||||
elif [ "$CPU_USE_BF16" == "false" ]; then
|
||||
TMP_DIR_REAL_PATH=`readlink -f ${OPS_TMP_DIR}`
|
||||
is_xpu=`$python -c "import paddle; print(paddle.is_compiled_with_xpu())"`
|
||||
if [ "$is_xpu" = "True" ]; then
|
||||
cd xpu_ops/src
|
||||
bash build.sh ${TMP_DIR_REAL_PATH}
|
||||
cd ../..
|
||||
elif [ "$FD_CPU_USE_BF16" == "true" ]; then
|
||||
if [ "$FD_BUILDING_ARCS" == "" ]; then
|
||||
FD_CPU_USE_BF16=True ${python} setup_ops.py install --install-lib ${OPS_TMP_DIR}
|
||||
else
|
||||
FD_BUILDING_ARCS=${FD_BUILDING_ARCS} FD_CPU_USE_BF16=True ${python} setup_ops.py install --install-lib ${OPS_TMP_DIR}
|
||||
fi
|
||||
find ${OPS_TMP_DIR} -type f -name "*.o" -exec rm -f {} \;
|
||||
elif [ "$FD_CPU_USE_BF16" == "false" ]; then
|
||||
if [ "$FD_BUILDING_ARCS" == "" ]; then
|
||||
${python} setup_ops.py install --install-lib ${OPS_TMP_DIR}
|
||||
:
|
||||
else
|
||||
FD_BUILDING_ARCS=${FD_BUILDING_ARCS} ${python} setup_ops.py install --install-lib ${OPS_TMP_DIR}
|
||||
fi
|
||||
find ${OPS_TMP_DIR} -type f -name "*.o" -exec rm -f {} \;
|
||||
else
|
||||
echo "Error: Invalid parameter '$CPU_USE_BF16'. Please use true or false."
|
||||
echo "Error: Invalid parameter '$FD_CPU_USE_BF16'. Please use true or false."
|
||||
exit 1
|
||||
fi
|
||||
if [ $? -ne 0 ]; then
|
||||
@@ -146,11 +156,7 @@ function build_and_install_ops() {
|
||||
|
||||
function build_and_install() {
|
||||
echo -e "${BLUE}[build]${NONE} building fastdeploy wheel..."
|
||||
if [ "$BUILDING_ARCS" == "" ]; then
|
||||
${python} setup.py bdist_wheel --python-tag py3
|
||||
else
|
||||
BUILDING_ARCS=${BUILDING_ARCS} ${python} setup.py bdist_wheel --python-tag py3
|
||||
fi
|
||||
${python} setup.py bdist_wheel --python-tag=py3
|
||||
|
||||
if [ $? -ne 0 ]; then
|
||||
echo -e "${RED}[FAIL]${NONE} build fastdeploy wheel failed"
|
||||
@@ -174,10 +180,12 @@ function cleanup() {
|
||||
rm -rf $BUILD_DIR $EGG_DIR
|
||||
if [ `${python} -m pip list | grep fastdeploy | wc -l` -gt 0 ]; then
|
||||
echo -e "${BLUE}[init]${NONE} uninstalling fastdeploy..."
|
||||
${python} -m pip uninstall -y fastdeploy
|
||||
${python} -m pip uninstall -y fastdeploy-${DEVICE_TYPE}
|
||||
fi
|
||||
|
||||
rm -rf $OPS_SRC_DIR/$BUILD_DIR $OPS_SRC_DIR/$EGG_DIR
|
||||
rm -rf $OPS_SRC_DIR/$OPS_TMP_DIR_BASE
|
||||
rm -rf $OPS_SRC_DIR/$OPS_TMP_DIR
|
||||
}
|
||||
|
||||
function abort() {
|
||||
@@ -187,7 +195,7 @@ function abort() {
|
||||
cur_dir=`basename "$pwd"`
|
||||
|
||||
rm -rf $BUILD_DIR $EGG_DIR $DIST_DIR
|
||||
${python} -m pip uninstall -y fastdeploy
|
||||
${python} -m pip uninstall -y fastdeploy-${DEVICE_TYPE}
|
||||
|
||||
rm -rf $OPS_SRC_DIR/$BUILD_DIR $OPS_SRC_DIR/$EGG_DIR
|
||||
}
|
||||
|
643
custom_ops/0001-DeepGEMM-95e81b3.patch
Normal file
643
custom_ops/0001-DeepGEMM-95e81b3.patch
Normal file
@@ -0,0 +1,643 @@
|
||||
From 5112002c155dceecc5e5983cdb67157e4f5400e2 Mon Sep 17 00:00:00 2001
|
||||
From: minghaipeng <minghaipeng@baidu.com>
|
||||
Date: Wed, 25 Jun 2025 15:05:24 +0800
|
||||
Subject: [PATCH] DeepGEMM 95e81b3
|
||||
|
||||
---
|
||||
deep_gemm/__init__.py | 2 +-
|
||||
deep_gemm/include/deep_gemm/scheduler.cuh | 2 +-
|
||||
deep_gemm/jit/compiler.py | 2 +-
|
||||
deep_gemm/jit/interleave_ffma.py | 2 +-
|
||||
deep_gemm/jit/runtime.py | 4 +-
|
||||
deep_gemm/jit/template.py | 34 ++++----
|
||||
deep_gemm/jit_kernels/gemm.py | 44 +++++------
|
||||
deep_gemm/jit_kernels/m_grouped_gemm.py | 96 +++++++++++------------
|
||||
deep_gemm/jit_kernels/tuner.py | 10 +--
|
||||
deep_gemm/jit_kernels/utils.py | 18 +++--
|
||||
deep_gemm/paddle_utils.py | 20 +++++
|
||||
deep_gemm/utils.py | 30 +++----
|
||||
12 files changed, 143 insertions(+), 121 deletions(-)
|
||||
create mode 100644 deep_gemm/paddle_utils.py
|
||||
|
||||
diff --git a/deep_gemm/__init__.py b/deep_gemm/__init__.py
|
||||
index 15b22ca..63e7fb7 100644
|
||||
--- a/deep_gemm/__init__.py
|
||||
+++ b/deep_gemm/__init__.py
|
||||
@@ -1,4 +1,4 @@
|
||||
-import torch
|
||||
+import paddle
|
||||
|
||||
from . import jit
|
||||
from .jit_kernels import (
|
||||
diff --git a/deep_gemm/include/deep_gemm/scheduler.cuh b/deep_gemm/include/deep_gemm/scheduler.cuh
|
||||
index 9743871..6c97152 100644
|
||||
--- a/deep_gemm/include/deep_gemm/scheduler.cuh
|
||||
+++ b/deep_gemm/include/deep_gemm/scheduler.cuh
|
||||
@@ -102,7 +102,7 @@ struct Scheduler {
|
||||
if constexpr (kGemmType == GemmType::Normal) {
|
||||
return block_idx * block_size;
|
||||
} else if constexpr (kGemmType == GemmType::GroupedContiguous) {
|
||||
- auto offset = kIgnoreGroupedForGroupedContiguous ? 0 : __ldg(grouped_layout + m_block_idx * BLOCK_M);
|
||||
+ auto offset = kIgnoreGroupedForGroupedContiguous ? 0 : max(0, __ldg(grouped_layout + m_block_idx * BLOCK_M));
|
||||
return offset * shape_dim + block_idx * block_size;
|
||||
} else if constexpr (kGemmType == GemmType::GroupedMasked) {
|
||||
return curr_group_idx * shape_dim + block_idx * block_size;
|
||||
diff --git a/deep_gemm/jit/compiler.py b/deep_gemm/jit/compiler.py
|
||||
index c17d466..6fdc52f 100644
|
||||
--- a/deep_gemm/jit/compiler.py
|
||||
+++ b/deep_gemm/jit/compiler.py
|
||||
@@ -4,7 +4,7 @@ import os
|
||||
import re
|
||||
import subprocess
|
||||
import uuid
|
||||
-from torch.utils.cpp_extension import CUDA_HOME
|
||||
+from ..paddle_utils import CUDA_HOME
|
||||
from typing import Tuple
|
||||
|
||||
from . import interleave_ffma
|
||||
diff --git a/deep_gemm/jit/interleave_ffma.py b/deep_gemm/jit/interleave_ffma.py
|
||||
index fcb377e..db9d6f3 100644
|
||||
--- a/deep_gemm/jit/interleave_ffma.py
|
||||
+++ b/deep_gemm/jit/interleave_ffma.py
|
||||
@@ -3,7 +3,7 @@ import mmap
|
||||
import os
|
||||
import re
|
||||
import subprocess
|
||||
-from torch.utils.cpp_extension import CUDA_HOME
|
||||
+from ..paddle_utils import CUDA_HOME
|
||||
|
||||
|
||||
def run_cuobjdump(file_path):
|
||||
diff --git a/deep_gemm/jit/runtime.py b/deep_gemm/jit/runtime.py
|
||||
index 66c370a..4761426 100644
|
||||
--- a/deep_gemm/jit/runtime.py
|
||||
+++ b/deep_gemm/jit/runtime.py
|
||||
@@ -1,6 +1,6 @@
|
||||
import ctypes
|
||||
import os
|
||||
-import torch
|
||||
+import paddle
|
||||
from typing import Optional
|
||||
|
||||
from .template import map_ctype
|
||||
@@ -35,7 +35,7 @@ class Runtime:
|
||||
assert len(args) == len(self.args), f'Expected {len(self.args)} arguments, got {len(args)}'
|
||||
cargs = []
|
||||
for arg, (name, dtype) in zip(args, self.args):
|
||||
- if isinstance(arg, torch.Tensor):
|
||||
+ if isinstance(arg, paddle.Tensor):
|
||||
assert arg.dtype == dtype, f'Expected tensor dtype `{dtype}` for `{name}`, got `{arg.dtype}`'
|
||||
else:
|
||||
assert isinstance(arg, dtype), f'Expected built-in type `{dtype}` for `{name}`, got `{type(arg)}`'
|
||||
diff --git a/deep_gemm/jit/template.py b/deep_gemm/jit/template.py
|
||||
index ead37f5..51b02c1 100644
|
||||
--- a/deep_gemm/jit/template.py
|
||||
+++ b/deep_gemm/jit/template.py
|
||||
@@ -1,24 +1,24 @@
|
||||
import copy
|
||||
import ctypes
|
||||
import os
|
||||
-import torch
|
||||
+import paddle
|
||||
from typing import Any, Dict, Iterable, Tuple
|
||||
|
||||
|
||||
# Name map for Python `eval`
|
||||
typename_map: Dict[Any, str] = {
|
||||
**{t: t.__name__ for t in (bool, int, float)},
|
||||
- torch.int: 'torch.int',
|
||||
- torch.float: 'torch.float',
|
||||
- torch.bfloat16: 'torch.bfloat16',
|
||||
- torch.float8_e4m3fn: 'torch.float8_e4m3fn',
|
||||
- torch.cuda.Stream: 'torch.cuda.Stream',
|
||||
+ paddle.int32: 'paddle.int32',
|
||||
+ paddle.float32: 'paddle.float32',
|
||||
+ paddle.bfloat16: 'paddle.bfloat16',
|
||||
+ paddle.float8_e4m3fn: 'paddle.float8_e4m3fn',
|
||||
+ paddle.device.cuda.Stream: "paddle.device.cuda.Stream",
|
||||
}
|
||||
|
||||
# `ctype` map for Python casting
|
||||
ctype_map: Dict[Any, Any] = {
|
||||
**{t: getattr(ctypes, f'c_{t.__name__}') for t in (bool, int, float)},
|
||||
- **{t: ctypes.c_void_p for t in (torch.int, torch.float, torch.bfloat16, torch.float8_e4m3fn, torch.cuda.Stream)},
|
||||
+ **{t: ctypes.c_void_p for t in (paddle.int32, paddle.float32, paddle.bfloat16, paddle.float8_e4m3fn, paddle.device.cuda.Stream)},
|
||||
}
|
||||
|
||||
|
||||
@@ -27,25 +27,25 @@ genc_map = {
|
||||
bool: ('bool', 'bool'),
|
||||
int: ('int', 'int'),
|
||||
float: ('float', 'float'),
|
||||
- torch.int: ('void*', 'int*'),
|
||||
- torch.float: ('void*', 'float*'),
|
||||
- torch.bfloat16: ('void*', '__nv_bfloat16*'),
|
||||
- torch.float8_e4m3fn: ('void*', '__nv_fp8_e4m3*'),
|
||||
- torch.cuda.Stream: ('void*', 'cudaStream_t'),
|
||||
+ paddle.int32: ('void*', 'int*'),
|
||||
+ paddle.float32: ('void*', 'float*'),
|
||||
+ paddle.bfloat16: ('void*', '__nv_bfloat16*'),
|
||||
+ paddle.float8_e4m3fn: ('void*', '__nv_fp8_e4m3*'),
|
||||
+ paddle.device.cuda.Stream: ('void*', 'cudaStream_t'),
|
||||
}
|
||||
|
||||
|
||||
def map_ctype(value: Any) -> Any:
|
||||
if hasattr(value, 'data_ptr'):
|
||||
- if value.dtype == torch.int:
|
||||
+ if value.dtype == paddle.int32:
|
||||
return ctypes.c_void_p(value.data_ptr())
|
||||
- elif value.dtype == torch.float:
|
||||
+ elif value.dtype == paddle.float32:
|
||||
return ctypes.c_void_p(value.data_ptr())
|
||||
- elif value.dtype == torch.bfloat16:
|
||||
+ elif value.dtype == paddle.bfloat16:
|
||||
return ctypes.c_void_p(value.data_ptr())
|
||||
- elif value.dtype == torch.float16:
|
||||
+ elif value.dtype == paddle.float16:
|
||||
return ctypes.c_void_p(value.data_ptr())
|
||||
- elif value.dtype == torch.float8_e4m3fn:
|
||||
+ elif value.dtype == paddle.float8_e4m3fn:
|
||||
return ctypes.c_void_p(value.data_ptr())
|
||||
else:
|
||||
return ctypes.c_void_p(value.data_ptr())
|
||||
diff --git a/deep_gemm/jit_kernels/gemm.py b/deep_gemm/jit_kernels/gemm.py
|
||||
index cb438b7..44aa0ed 100644
|
||||
--- a/deep_gemm/jit_kernels/gemm.py
|
||||
+++ b/deep_gemm/jit_kernels/gemm.py
|
||||
@@ -1,5 +1,5 @@
|
||||
import math
|
||||
-import torch
|
||||
+import paddle
|
||||
from functools import lru_cache
|
||||
from typing import Tuple
|
||||
|
||||
@@ -166,20 +166,20 @@ def get_best_configs(m: int, n: int, k: int, num_groups: int, num_sms: int,
|
||||
return num_min_sms, best_block_m, best_block_n, best_num_stages, best_tma_multicast_config, best_smem_config
|
||||
|
||||
|
||||
-def gemm_fp8_fp8_bf16_nt(lhs: Tuple[torch.Tensor, torch.Tensor],
|
||||
- rhs: Tuple[torch.Tensor, torch.Tensor],
|
||||
- out: torch.Tensor) -> None:
|
||||
+def gemm_fp8_fp8_bf16_nt(lhs: Tuple[paddle.Tensor, paddle.Tensor],
|
||||
+ rhs: Tuple[paddle.Tensor, paddle.Tensor],
|
||||
+ out: paddle.Tensor) -> None:
|
||||
"""
|
||||
Do a normal GEMM with FP8 inputs and BF16 output, with 1x128 LHS scaling and 128x128 RHS scaling.
|
||||
LHS, RHS, RHS scaling factors, and output tensors must be in contiguous format.
|
||||
RHS and RHS scaling factors are required to be transposed.
|
||||
The LHS scaling tensor requires TMA-aligned transposed format, if your input does not match the requirement,
|
||||
- this function will do a transposing with a set of slow PyTorch operations.
|
||||
+ this function will do a transposing with a set of slow paddle operations.
|
||||
|
||||
Arguments:
|
||||
- lhs: the first element is an FP8 tensor (typed `torch.float8_e4m3fn`) of shape `[m, k]`,
|
||||
+ lhs: the first element is an FP8 tensor (typed `paddle.float8_e4m3fn`) of shape `[m, k]`,
|
||||
the second element is an FP32 1x128 scaling tensor for LHS of shape `[m, ⌈k / 128⌉]`.
|
||||
- rhs: the first element is an FP8 tensor (typed `torch.float8_e4m3fn`) of shape `[n, k]`.
|
||||
+ rhs: the first element is an FP8 tensor (typed `paddle.float8_e4m3fn`) of shape `[n, k]`.
|
||||
the second element is an FP32 128x128 scaling tensor for RHS of shape `[⌈n / 128⌉, ⌈k / 128⌉]`.
|
||||
out: the BF16 output tensor of shape `[m, n]`, representing the result.
|
||||
"""
|
||||
@@ -189,22 +189,22 @@ def gemm_fp8_fp8_bf16_nt(lhs: Tuple[torch.Tensor, torch.Tensor],
|
||||
n, k_ = rhs.shape
|
||||
m_, n_ = out.shape
|
||||
|
||||
- assert n % 64 == 0 and k % 128 == 0
|
||||
+ # assert n % 64 == 0 and k % 128 == 0
|
||||
|
||||
# Type and shape checks
|
||||
- assert m == m_ and n == n_ and k == k_
|
||||
- assert n > 0 and k > 0
|
||||
- assert lhs_scales.shape == (m, (k + 127) // 128)
|
||||
- assert rhs_scales.shape == ((n + 127) // 128, (k + 127) // 128)
|
||||
- assert lhs.dtype == torch.float8_e4m3fn and lhs_scales.dtype == torch.float32
|
||||
- assert rhs.dtype == torch.float8_e4m3fn and rhs_scales.dtype == torch.float32
|
||||
- assert out.dtype == torch.bfloat16
|
||||
- assert lhs.is_contiguous() and rhs.is_contiguous() and out.is_contiguous()
|
||||
+ # assert m == m_ and n == n_ and k == k_
|
||||
+ # assert n > 0 and k > 0
|
||||
+ # assert lhs_scales.shape == (m, (k + 127) // 128)
|
||||
+ # assert rhs_scales.shape == ((n + 127) // 128, (k + 127) // 128)
|
||||
+ # assert lhs.dtype == paddle.float8_e4m3fn and lhs_scales.dtype == paddle.float32
|
||||
+ # assert rhs.dtype == paddle.float8_e4m3fn and rhs_scales.dtype == paddle.float32
|
||||
+ # assert out.dtype == paddle.bfloat16
|
||||
+ # assert lhs.is_contiguous() and rhs.is_contiguous() and out.is_contiguous()
|
||||
|
||||
# LHS scales must be transposed for TMA load, but not for RHS scales
|
||||
# NOTES: `get_tma_aligned_lhs_scales` may launch a kernel if not processed by previous kernels
|
||||
lhs_scales = get_col_major_tma_aligned_tensor(lhs_scales)
|
||||
- assert rhs_scales.is_contiguous()
|
||||
+ # assert rhs_scales.is_contiguous()
|
||||
|
||||
# Do nothing if `m` is zero
|
||||
if m == 0:
|
||||
@@ -214,7 +214,7 @@ def gemm_fp8_fp8_bf16_nt(lhs: Tuple[torch.Tensor, torch.Tensor],
|
||||
global includes, template
|
||||
num_sms = get_num_sms()
|
||||
num_sms, block_m, block_n, num_stages, tma_multicast_config, smem_config = get_best_configs(m, n, k, 1, num_sms)
|
||||
- args = (lhs, lhs_scales, rhs, rhs_scales, out, m, torch.cuda.current_stream(), num_sms, smem_config[0])
|
||||
+ args = (lhs, lhs_scales, rhs, rhs_scales, out, m, paddle.device.cuda.current_stream(), num_sms, smem_config[0])
|
||||
runtime = jit_tuner.compile_and_tune(
|
||||
name='gemm_fp8_fp8_bf16_nt',
|
||||
keys={'N': n, 'K': k, 'BLOCK_M': block_m, 'BLOCK_N': block_n,
|
||||
@@ -225,10 +225,10 @@ def gemm_fp8_fp8_bf16_nt(lhs: Tuple[torch.Tensor, torch.Tensor],
|
||||
'IS_TMA_MULTICAST_ON_A': tma_multicast_config[1]},
|
||||
space=(),
|
||||
includes=includes,
|
||||
- arg_defs=(('lhs', torch.float8_e4m3fn), ('lhs_scales', torch.float),
|
||||
- ('rhs', torch.float8_e4m3fn), ('rhs_scales', torch.float),
|
||||
- ('out', torch.bfloat16), ('m', int),
|
||||
- ('stream', torch.cuda.Stream), ('num_sms', int), ('smem_size', int)),
|
||||
+ arg_defs=(('lhs', paddle.float8_e4m3fn), ('lhs_scales', paddle.float32),
|
||||
+ ('rhs', paddle.float8_e4m3fn), ('rhs_scales', paddle.float32),
|
||||
+ ('out', paddle.bfloat16), ('m', int),
|
||||
+ ('stream', paddle.device.cuda.Stream), ('num_sms', int), ('smem_size', int)),
|
||||
template=template,
|
||||
args=args
|
||||
)
|
||||
diff --git a/deep_gemm/jit_kernels/m_grouped_gemm.py b/deep_gemm/jit_kernels/m_grouped_gemm.py
|
||||
index 3b518c9..ba776bd 100644
|
||||
--- a/deep_gemm/jit_kernels/m_grouped_gemm.py
|
||||
+++ b/deep_gemm/jit_kernels/m_grouped_gemm.py
|
||||
@@ -1,4 +1,4 @@
|
||||
-import torch
|
||||
+import paddle
|
||||
from typing import Tuple
|
||||
|
||||
from .gemm import get_best_configs, get_block_n_padding_for_smem_d
|
||||
@@ -37,25 +37,25 @@ gemm_t::run(out, rhs_scales, grouped_layout,
|
||||
"""
|
||||
|
||||
|
||||
-def m_grouped_gemm_fp8_fp8_bf16_nt_contiguous(lhs: Tuple[torch.Tensor, torch.Tensor],
|
||||
- rhs: Tuple[torch.Tensor, torch.Tensor],
|
||||
- out: torch.Tensor, m_indices: torch.Tensor) -> None:
|
||||
+def m_grouped_gemm_fp8_fp8_bf16_nt_contiguous(lhs: Tuple[paddle.Tensor, paddle.Tensor],
|
||||
+ rhs: Tuple[paddle.Tensor, paddle.Tensor],
|
||||
+ out: paddle.Tensor, m_indices: paddle.Tensor) -> None:
|
||||
"""
|
||||
Do a grouped GEMM (contiguous format) with FP8 inputs and BF16 output, with 1x128 LHS scaling and 128x128 RHS scaling.
|
||||
LHS, RHS, RHS scaling factors, and output tensors must be in contiguous format.
|
||||
RHS and RHS scaling factors are required to be transposed.
|
||||
The LHS scaling tensor requires TMA-aligned transposed format, if your input does not match the requirement,
|
||||
- this function will do a transposing with a set of slow PyTorch operations.
|
||||
+ this function will do a transposing with a set of slow Pypaddle operations.
|
||||
On the M axis, inputs are grouped into several batches, of which batch sizes aligned to
|
||||
`get_m_alignment_for_contiguous_layout()` (128).
|
||||
|
||||
Arguments:
|
||||
- lhs: the first element is an FP8 tensor (typed `torch.float8_e4m3fn`) of shape `[m_sum, k]`,
|
||||
+ lhs: the first element is an FP8 tensor (typed `paddle.float8_e4m3fn`) of shape `[m_sum, k]`,
|
||||
the second element is an FP32 1x128 scaling tensor for LHS of shape `[m_sum, ⌈k / 128⌉]`.
|
||||
- rhs: the first element is an FP8 tensor (typed `torch.float8_e4m3fn`) of shape `[num_groups, n, k]`.
|
||||
+ rhs: the first element is an FP8 tensor (typed `paddle.float8_e4m3fn`) of shape `[num_groups, n, k]`.
|
||||
the second element is an FP32 128x128 scaling tensor for RHS of shape `[num_groups, ⌈n / 128⌉, ⌈k / 128⌉]`.
|
||||
out: the BF16 output tensor of shape `[m_sum, n]`, representing the result.
|
||||
- m_indices: a tensor of shape `[m_sum]` with type `torch.int`.
|
||||
+ m_indices: a tensor of shape `[m_sum]` with type `paddle.int`.
|
||||
`m_indices[i]` records the group which the i-th row of the LHS belong to,
|
||||
which means that the i-th row of the LHS matrix will be multiplied with `rhs[m_indices[i]]`.
|
||||
Values of `m_indices` in every-m-alignment-block must also be the same.
|
||||
@@ -68,19 +68,19 @@ def m_grouped_gemm_fp8_fp8_bf16_nt_contiguous(lhs: Tuple[torch.Tensor, torch.Ten
|
||||
m__ = m_indices.numel()
|
||||
|
||||
# Type and shape checks
|
||||
- assert m == m_ == m__ and k == k_ and n == n_
|
||||
- assert lhs_scales.shape == (m, (k + 127) // 128)
|
||||
- assert rhs_scales.shape == (num_groups, (n + 127) // 128, (k + 127) // 128)
|
||||
- assert lhs.dtype == torch.float8_e4m3fn and lhs_scales.dtype == torch.float32
|
||||
- assert rhs.dtype == torch.float8_e4m3fn and rhs_scales.dtype == torch.float32
|
||||
- assert out.dtype == torch.bfloat16
|
||||
- assert m_indices.dtype == torch.int32
|
||||
- assert lhs.is_contiguous() and rhs.is_contiguous()
|
||||
- assert out.is_contiguous() and m_indices.is_contiguous()
|
||||
+ # assert m == m_ == m__ and k == k_ and n == n_
|
||||
+ # assert lhs_scales.shape == (m, (k + 127) // 128)
|
||||
+ # assert rhs_scales.shape == (num_groups, (n + 127) // 128, (k + 127) // 128)
|
||||
+ # assert lhs.dtype == paddle.float8_e4m3fn and lhs_scales.dtype == paddle.float32
|
||||
+ # assert rhs.dtype == paddle.float8_e4m3fn and rhs_scales.dtype == paddle.float32
|
||||
+ # assert out.dtype == paddle.bfloat16
|
||||
+ # assert m_indices.dtype == paddle.int32
|
||||
+ # assert lhs.is_contiguous() and rhs.is_contiguous()
|
||||
+ # assert out.is_contiguous() and m_indices.is_contiguous()
|
||||
|
||||
# LHS scales must be transposed for TMA load, but not for RHS scales
|
||||
lhs_scales = get_col_major_tma_aligned_tensor(lhs_scales)
|
||||
- assert rhs_scales.is_contiguous()
|
||||
+ # assert rhs_scales.is_contiguous()
|
||||
|
||||
# Do nothing if `m` is zero
|
||||
if m == 0:
|
||||
@@ -92,7 +92,7 @@ def m_grouped_gemm_fp8_fp8_bf16_nt_contiguous(lhs: Tuple[torch.Tensor, torch.Ten
|
||||
num_sms, block_m, block_n, num_stages, tma_multicast_config, smem_config = get_best_configs(m, n, k, 1, num_sms, is_grouped_contiguous=True)
|
||||
args = (lhs, lhs_scales, rhs, rhs_scales, out,
|
||||
m_indices, m, num_groups,
|
||||
- torch.cuda.current_stream(), num_sms, smem_config[0])
|
||||
+ paddle.device.cuda.current_stream(), num_sms, smem_config[0])
|
||||
runtime = jit_tuner.compile_and_tune(
|
||||
name='m_grouped_gemm_fp8_fp8_bf16_nt',
|
||||
keys={'N': n, 'K': k, 'BLOCK_M': block_m, 'BLOCK_N': block_n,
|
||||
@@ -105,11 +105,11 @@ def m_grouped_gemm_fp8_fp8_bf16_nt_contiguous(lhs: Tuple[torch.Tensor, torch.Ten
|
||||
'GEMM_TYPE': 'GroupedContiguous'},
|
||||
space=(),
|
||||
includes=includes,
|
||||
- arg_defs=(('lhs', torch.float8_e4m3fn), ('lhs_scales', torch.float),
|
||||
- ('rhs', torch.float8_e4m3fn), ('rhs_scales', torch.float),
|
||||
- ('out', torch.bfloat16),
|
||||
- ('grouped_layout', torch.int32), ('m', int), ('num_groups', int),
|
||||
- ('stream', torch.cuda.Stream), ('num_sms', int), ('smem_size', int)),
|
||||
+ arg_defs=(('lhs', paddle.float8_e4m3fn), ('lhs_scales', paddle.float32),
|
||||
+ ('rhs', paddle.float8_e4m3fn), ('rhs_scales', paddle.float32),
|
||||
+ ('out', paddle.bfloat16),
|
||||
+ ('grouped_layout', paddle.int32), ('m', int), ('num_groups', int),
|
||||
+ ('stream', paddle.device.cuda.Stream), ('num_sms', int), ('smem_size', int)),
|
||||
template=template,
|
||||
args=args
|
||||
)
|
||||
@@ -118,22 +118,22 @@ def m_grouped_gemm_fp8_fp8_bf16_nt_contiguous(lhs: Tuple[torch.Tensor, torch.Ten
|
||||
runtime(*args)
|
||||
|
||||
|
||||
-def m_grouped_gemm_fp8_fp8_bf16_nt_masked(lhs: Tuple[torch.Tensor, torch.Tensor],
|
||||
- rhs: Tuple[torch.Tensor, torch.Tensor],
|
||||
- out: torch.Tensor, masked_m: torch.Tensor, expected_m: int) -> None:
|
||||
+def m_grouped_gemm_fp8_fp8_bf16_nt_masked(lhs: Tuple[paddle.Tensor, paddle.Tensor],
|
||||
+ rhs: Tuple[paddle.Tensor, paddle.Tensor],
|
||||
+ out: paddle.Tensor, masked_m: paddle.Tensor, expected_m: int) -> None:
|
||||
"""
|
||||
Do a grouped GEMM (masked format) with FP8 inputs and BF16 output, with 1x128 LHS scaling and 128x128 RHS scaling.
|
||||
LHS, RHS, RHS scaling factors, and output tensors must be in contiguous format.
|
||||
RHS and RHS scaling factors are required to be transposed.
|
||||
The LHS scaling tensor requires TMA-aligned transposed format, if your input does not match the requirement,
|
||||
- this function will do a transposing with a set of slow PyTorch operations.
|
||||
+ this function will do a transposing with a set of slow paddle operations.
|
||||
Moreover, this alignment requirement is different with the contiguous-format kernel, as we require that each batch
|
||||
should be separately transposed.
|
||||
|
||||
Arguments:
|
||||
- lhs: the first element is an FP8 tensor (typed `torch.float8_e4m3fn`) of shape `[num_groups, m_max, k]`,
|
||||
+ lhs: the first element is an FP8 tensor (typed `paddle.float8_e4m3fn`) of shape `[num_groups, m_max, k]`,
|
||||
the second element is an FP32 1x128 scaling tensor for LHS of shape `[num_groups, m_max, ⌈k / 128⌉]`.
|
||||
- rhs: the first element is an FP8 tensor (typed `torch.float8_e4m3fn`) of shape `[num_groups, n, k]`.
|
||||
+ rhs: the first element is an FP8 tensor (typed `paddle.float8_e4m3fn`) of shape `[num_groups, n, k]`.
|
||||
the second element is an FP32 128x128 scaling tensor for RHS of shape `[num_groups, ⌈n / 128⌉, ⌈k / 128⌉]`.
|
||||
out: the BF16 output tensor of shape `[num_groups, m_max, n]`, representing the result.
|
||||
masked_m: a tensor of shape `[num_groups]`, `masked_m[i]` records actual rows of the `lhs[i]` matrix to compute
|
||||
@@ -149,21 +149,21 @@ def m_grouped_gemm_fp8_fp8_bf16_nt_masked(lhs: Tuple[torch.Tensor, torch.Tensor]
|
||||
num_groups___ = masked_m.numel()
|
||||
|
||||
# Type and shape checks
|
||||
- assert num_groups == num_groups_ == num_groups__ == num_groups___
|
||||
- assert m == m_ and n == n_ and k == k_
|
||||
- assert expected_m > 0 and m > 0 and n > 0 and k > 0 and num_groups > 0
|
||||
- assert lhs_scales.shape == (num_groups, m, (k + 127) // 128)
|
||||
- assert rhs_scales.shape == (num_groups, (n + 127) // 128, (k + 127) // 128)
|
||||
- assert lhs.dtype == torch.float8_e4m3fn and lhs_scales.dtype == torch.float32
|
||||
- assert rhs.dtype == torch.float8_e4m3fn and rhs_scales.dtype == torch.float32
|
||||
- assert out.dtype == torch.bfloat16
|
||||
- assert masked_m.dtype == torch.int32
|
||||
- assert lhs.is_contiguous() and rhs.is_contiguous()
|
||||
- assert out.is_contiguous() and masked_m.is_contiguous()
|
||||
+ # assert num_groups == num_groups_ == num_groups__ == num_groups___
|
||||
+ # assert m == m_ and n == n_ and k == k_
|
||||
+ # assert expected_m > 0 and m > 0 and n > 0 and k > 0 and num_groups > 0
|
||||
+ # assert lhs_scales.shape == (num_groups, m, (k + 127) // 128)
|
||||
+ # assert rhs_scales.shape == (num_groups, (n + 127) // 128, (k + 127) // 128)
|
||||
+ # assert lhs.dtype == paddle.float8_e4m3fn and lhs_scales.dtype == paddle.float32
|
||||
+ # assert rhs.dtype == paddle.float8_e4m3fn and rhs_scales.dtype == paddle.float32
|
||||
+ # assert out.dtype == paddle.bfloat16
|
||||
+ # assert masked_m.dtype == paddle.int32
|
||||
+ # assert lhs.is_contiguous() and rhs.is_contiguous()
|
||||
+ # assert out.is_contiguous() and masked_m.is_contiguous()
|
||||
|
||||
# LHS scales must be transposed for TMA load, but not for RHS scales
|
||||
lhs_scales = get_col_major_tma_aligned_tensor(lhs_scales)
|
||||
- assert rhs_scales.is_contiguous()
|
||||
+ # assert rhs_scales.is_contiguous()
|
||||
|
||||
# Auto-tuning with compilation
|
||||
global includes, template
|
||||
@@ -176,7 +176,7 @@ def m_grouped_gemm_fp8_fp8_bf16_nt_masked(lhs: Tuple[torch.Tensor, torch.Tensor]
|
||||
|
||||
args = (lhs, lhs_scales, rhs, rhs_scales, out,
|
||||
masked_m, m,
|
||||
- torch.cuda.current_stream(), num_sms, smem_config[0])
|
||||
+ paddle.device.cuda.current_stream(), num_sms, smem_config[0])
|
||||
runtime = jit_tuner.compile_and_tune(
|
||||
name='m_grouped_gemm_fp8_fp8_bf16_nt',
|
||||
keys={'N': n, 'K': k, 'BLOCK_M': block_m, 'BLOCK_N': block_n,
|
||||
@@ -189,11 +189,11 @@ def m_grouped_gemm_fp8_fp8_bf16_nt_masked(lhs: Tuple[torch.Tensor, torch.Tensor]
|
||||
'GEMM_TYPE': 'GroupedMasked'},
|
||||
space=(),
|
||||
includes=includes,
|
||||
- arg_defs=(('lhs', torch.float8_e4m3fn), ('lhs_scales', torch.float),
|
||||
- ('rhs', torch.float8_e4m3fn), ('rhs_scales', torch.float),
|
||||
- ('out', torch.bfloat16),
|
||||
- ('grouped_layout', torch.int32), ('m', int),
|
||||
- ('stream', torch.cuda.Stream), ('num_sms', int), ('smem_size', int)),
|
||||
+ arg_defs=(('lhs', paddle.float8_e4m3fn), ('lhs_scales', paddle.float32),
|
||||
+ ('rhs', paddle.float8_e4m3fn), ('rhs_scales', paddle.float32),
|
||||
+ ('out', paddle.bfloat16),
|
||||
+ ('grouped_layout', paddle.int32), ('m', int),
|
||||
+ ('stream', paddle.device.cuda.Stream), ('num_sms', int), ('smem_size', int)),
|
||||
template=template,
|
||||
args=args
|
||||
)
|
||||
diff --git a/deep_gemm/jit_kernels/tuner.py b/deep_gemm/jit_kernels/tuner.py
|
||||
index 6ed6749..9e1d70f 100644
|
||||
--- a/deep_gemm/jit_kernels/tuner.py
|
||||
+++ b/deep_gemm/jit_kernels/tuner.py
|
||||
@@ -1,6 +1,6 @@
|
||||
import copy
|
||||
import os
|
||||
-import torch
|
||||
+import paddle
|
||||
from typing import Any, Dict
|
||||
|
||||
from ..jit import build, cpp_format, generate, Runtime
|
||||
@@ -51,10 +51,10 @@ class JITTuner:
|
||||
continue
|
||||
|
||||
# Measure performance with L2 flush and a large GEMM kernel before to reduce overhead between kernels
|
||||
- start_event = torch.cuda.Event(enable_timing=True)
|
||||
- end_event = torch.cuda.Event(enable_timing=True)
|
||||
- torch.empty(int(256e6 // 4), dtype=torch.int, device='cuda').zero_()
|
||||
- torch.randn((8192, 8192), dtype=torch.float, device='cuda') @ torch.randn((8192, 8192), dtype=torch.float, device='cuda')
|
||||
+ start_event = paddle.device.cuda.Event(enable_timing=True)
|
||||
+ end_event = paddle.device.cuda.Event(enable_timing=True)
|
||||
+ paddle.empty((int(256e6 // 4)), dtype=paddle.int32).zero_()
|
||||
+ paddle.randn((8192, 8192), dtype=paddle.float32) @ paddle.randn((8192, 8192), dtype=paddle.float32)
|
||||
start_event.record()
|
||||
for i in range(20):
|
||||
assert runtime(*args) == 0
|
||||
diff --git a/deep_gemm/jit_kernels/utils.py b/deep_gemm/jit_kernels/utils.py
|
||||
index c6da56b..a17b1b1 100644
|
||||
--- a/deep_gemm/jit_kernels/utils.py
|
||||
+++ b/deep_gemm/jit_kernels/utils.py
|
||||
@@ -1,4 +1,4 @@
|
||||
-import torch
|
||||
+import paddle
|
||||
|
||||
_num_sms = None
|
||||
|
||||
@@ -11,7 +11,7 @@ def set_num_sms(num_sms: int) -> None:
|
||||
num_sms: the desired maximum SM count for all GEMM kernels to use.
|
||||
"""
|
||||
global _num_sms
|
||||
- assert 0 < num_sms <= torch.cuda.get_device_properties(device='cuda').multi_processor_count
|
||||
+ assert 0 < num_sms <= paddle.device.cuda.get_device_properties().multi_processor_count
|
||||
_num_sms = num_sms
|
||||
|
||||
|
||||
@@ -25,7 +25,7 @@ def get_num_sms() -> int:
|
||||
"""
|
||||
global _num_sms
|
||||
if _num_sms is None:
|
||||
- _num_sms = torch.cuda.get_device_properties(device='cuda').multi_processor_count
|
||||
+ _num_sms = paddle.device.cuda.get_device_properties().multi_processor_count
|
||||
return _num_sms
|
||||
|
||||
|
||||
@@ -74,9 +74,9 @@ def get_tma_aligned_size(x: int, element_size: int) -> int:
|
||||
return ceil_div(x, alignment) * alignment
|
||||
|
||||
|
||||
-def get_col_major_tma_aligned_tensor(x: torch.Tensor) -> torch.Tensor:
|
||||
+def get_col_major_tma_aligned_tensor(x: paddle.Tensor) -> paddle.Tensor:
|
||||
"""
|
||||
- Returns TMA-aligned transposed format of the input tensor. `torch.transpose` will be called if necessary.
|
||||
+ Returns TMA-aligned transposed format of the input tensor. `paddle.transpose` will be called if necessary.
|
||||
If the input tensor is already column-major layout and 16-byte aligned along the M axis
|
||||
(thus meets the requirement of LHS scaling tensor in DeepGEMM), this function will do nothing.
|
||||
|
||||
@@ -92,18 +92,20 @@ def get_col_major_tma_aligned_tensor(x: torch.Tensor) -> torch.Tensor:
|
||||
m, n = x.shape[-2], x.shape[-1]
|
||||
aligned_m = get_tma_aligned_size(m, x.element_size())
|
||||
if x.dim() == 2:
|
||||
- if x.stride(0) == 1 and x.stride(1) == aligned_m:
|
||||
+ if x.strides[0] == 1 and x.strides[1] == aligned_m:
|
||||
return x
|
||||
x, remove_dim = x.unsqueeze(0), True
|
||||
|
||||
b = x.shape[0]
|
||||
|
||||
# The last kernel gives a column-major TMA aligned layout
|
||||
- if x.stride(0) == aligned_m * n and x.stride(1) == 1 and x.stride(2) == aligned_m:
|
||||
+ if x.strides[0] == aligned_m * n and x.strides[1] == 1 and x.strides[2] == aligned_m:
|
||||
return x.squeeze(0) if remove_dim else x
|
||||
|
||||
# Normal layout requires transposing
|
||||
- aligned_x = torch.transpose(torch.empty((b, n, aligned_m), device=x.device, dtype=x.dtype), 1, 2)
|
||||
+ aligned_x = paddle.transpose(
|
||||
+ paddle.empty((b, n, aligned_m), dtype=x.dtype), perm=[0, 2, 1]
|
||||
+ )
|
||||
aligned_x[:, :m, :] = x
|
||||
aligned_x = aligned_x[:, :m, :]
|
||||
return aligned_x.squeeze(0) if remove_dim else aligned_x
|
||||
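For the scale tensors above, `get_col_major_tma_aligned_tensor` only re-lays-out the input when its strides are not already column-major and TMA-aligned along M. The sketch below restates the alignment arithmetic and the Paddle-side stride check; the 16-byte TMA alignment constant is an assumption taken from DeepGEMM's defaults, and `scales` is just a toy tensor.

```python
import paddle

def get_tma_aligned_size(x: int, element_size: int, tma_alignment_bytes: int = 16) -> int:
    # Round x up so that x * element_size is a multiple of the TMA alignment.
    assert tma_alignment_bytes % element_size == 0
    alignment = tma_alignment_bytes // element_size
    return (x + alignment - 1) // alignment * alignment

# Paddle exposes strides as a property (x.strides) rather than torch's x.stride(i).
scales = paddle.randn((96, 4), dtype=paddle.float32)
aligned_m = get_tma_aligned_size(scales.shape[0], element_size=4)  # float32 -> 4 bytes
already_aligned = scales.strides[0] == 1 and scales.strides[1] == aligned_m
```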
diff --git a/deep_gemm/paddle_utils.py b/deep_gemm/paddle_utils.py
|
||||
new file mode 100644
|
||||
index 0000000..2326807
|
||||
--- /dev/null
|
||||
+++ b/deep_gemm/paddle_utils.py
|
||||
@@ -0,0 +1,20 @@
|
||||
+import os
|
||||
+
|
||||
+def get_cuda_home():
|
||||
+ """Get Cuda home directory"""
|
||||
+ cuda_home = os.environ.get("CUDA_HOME") or os.environ.get("CUDA_PATH")
|
||||
+ if cuda_home:
|
||||
+ return cuda_home
|
||||
+
|
||||
+ try:
|
||||
+ which_cmd = "which nvcc"
|
||||
+
|
||||
+ nvcc_path = os.popen(which_cmd).read().strip()
|
||||
+ if nvcc_path:
|
||||
+ return os.path.dirname(os.path.dirname(nvcc_path))
|
||||
+ except Exception:
|
||||
+ pass
|
||||
+
|
||||
+ return None
|
||||
+
|
||||
+CUDA_HOME = get_cuda_home()
|
||||
\ No newline at end of file
|
||||
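`paddle_utils.py` above locates the CUDA toolkit from the environment and falls back to shelling out to `which nvcc`. The same lookup can also be done without a subprocess; this is a hedged alternative sketch, not the file's actual implementation.

```python
import os
import shutil

def find_cuda_home():
    """Return the CUDA toolkit root, or None if it cannot be located."""
    cuda_home = os.environ.get("CUDA_HOME") or os.environ.get("CUDA_PATH")
    if cuda_home:
        return cuda_home
    nvcc = shutil.which("nvcc")  # e.g. /usr/local/cuda/bin/nvcc
    if nvcc:
        # strip ".../bin/nvcc" down to the toolkit root
        return os.path.dirname(os.path.dirname(nvcc))
    return None
```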
diff --git a/deep_gemm/utils.py b/deep_gemm/utils.py
|
||||
index d5cdd01..5237f09 100644
|
||||
--- a/deep_gemm/utils.py
|
||||
+++ b/deep_gemm/utils.py
|
||||
@@ -1,15 +1,15 @@
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
-import torch
|
||||
-import torch.distributed as dist
|
||||
+import paddle
|
||||
+import paddle.distributed as dist
|
||||
|
||||
|
||||
def bench(fn, num_warmups: int = 5, num_tests: int = 10,
|
||||
high_precision: bool = False):
|
||||
# Flush L2 cache with 256 MB data
|
||||
- torch.cuda.synchronize()
|
||||
- cache = torch.empty(int(256e6 // 4), dtype=torch.int, device='cuda')
|
||||
+ paddle.device.cuda.synchronize()
|
||||
+ cache = paddle.empty((int(256e6 // 4)), dtype=paddle.int32)
|
||||
cache.zero_()
|
||||
|
||||
# Warmup
|
||||
@@ -18,18 +18,18 @@ def bench(fn, num_warmups: int = 5, num_tests: int = 10,
|
||||
|
||||
# Add a large kernel to eliminate the CPU launch overhead
|
||||
if high_precision:
|
||||
- x = torch.randn((8192, 8192), dtype=torch.float, device='cuda')
|
||||
- y = torch.randn((8192, 8192), dtype=torch.float, device='cuda')
|
||||
+ x = paddle.randn((8192, 8192), dtype=paddle.float32)
|
||||
+ y = paddle.randn((8192, 8192), dtype=paddle.float32)
|
||||
x @ y
|
||||
|
||||
# Testing
|
||||
- start_event = torch.cuda.Event(enable_timing=True)
|
||||
- end_event = torch.cuda.Event(enable_timing=True)
|
||||
+ start_event = paddle.device.cuda.Event(enable_timing=True)
|
||||
+ end_event = paddle.device.cuda.Event(enable_timing=True)
|
||||
start_event.record()
|
||||
for i in range(num_tests):
|
||||
fn()
|
||||
end_event.record()
|
||||
- torch.cuda.synchronize()
|
||||
+ paddle.device.synchronize()
|
||||
|
||||
return start_event.elapsed_time(end_event) / num_tests
|
||||
|
||||
@@ -106,21 +106,21 @@ def bench_kineto(fn, kernel_names, num_tests: int = 30, suppress_kineto_output:
|
||||
# Profile
|
||||
suppress = suppress_stdout_stderr if suppress_kineto_output and not using_nsys else empty_suppress
|
||||
with suppress():
|
||||
- schedule = torch.profiler.schedule(wait=0, warmup=1, active=1, repeat=1) if not using_nsys else None
|
||||
- profiler = torch.profiler.profile(activities=[torch.profiler.ProfilerActivity.CUDA], schedule=schedule) if not using_nsys else empty_suppress()
|
||||
+ scheduler = paddle.profiler.make_scheduler(closed=0, ready=1, record=1, repeat=1) if not using_nsys else None
|
||||
+ profiler = paddle.profiler.Profiler(targets=[paddle.profiler.ProfilerTarget.CPU, paddle.profiler.ProfilerTarget.GPU], scheduler=scheduler) if not using_nsys else empty_suppress()
|
||||
with profiler:
|
||||
for i in range(2):
|
||||
# NOTES: use a large kernel and a barrier to eliminate the unbalanced CPU launch overhead
|
||||
if barrier_comm_profiling:
|
||||
- lhs = torch.randn((8192, 8192), dtype=torch.float, device='cuda')
|
||||
- rhs = torch.randn((8192, 8192), dtype=torch.float, device='cuda')
|
||||
+ lhs = paddle.randn((8192, 8192), dtype=paddle.float32)
|
||||
+ rhs = paddle.randn((8192, 8192), dtype=paddle.float32)
|
||||
lhs @ rhs
|
||||
- dist.all_reduce(torch.ones(1, dtype=torch.float, device='cuda'))
|
||||
+ dist.all_reduce(paddle.ones(1, dtype=paddle.float32))
|
||||
for _ in range(num_tests):
|
||||
if sleep_between_tests > 0.0:
|
||||
time.sleep(sleep_between_tests)
|
||||
if flush_l2:
|
||||
- torch.empty(flush_l2_size, dtype=torch.int, device='cuda').zero_()
|
||||
+ paddle.empty(flush_l2_size, dtype=paddle.int32).zero_()
|
||||
fn()
|
||||
|
||||
if not using_nsys:
|
||||
--
|
||||
2.43.0
|
||||
|
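The `bench_kineto` change at the end of the patch swaps `torch.profiler` for `paddle.profiler` with a one-warmup, one-record schedule. A minimal standalone sketch of that usage; it assumes the profiler is stepped once per iteration, and `profile_gpu` itself is an illustrative wrapper rather than code from the patch.

```python
import paddle
from paddle import profiler

def profile_gpu(fn, steps: int = 2):
    """Record a CPU+GPU timeline for `fn` with a closed->ready->record schedule."""
    sched = profiler.make_scheduler(closed=0, ready=1, record=1, repeat=1)
    prof = profiler.Profiler(
        targets=[profiler.ProfilerTarget.CPU, profiler.ProfilerTarget.GPU],
        scheduler=sched,
    )
    with prof:
        for _ in range(steps):
            fn()
            prof.step()  # advance the scheduler after each iteration
    return prof
```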
@@ -1,188 +0,0 @@
|
||||
// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
#include "dtype.h"
|
||||
#include "matmul_helper.h"
|
||||
#include "my_types.h"
|
||||
#include "paddle/extension.h"
|
||||
#include "paddle/phi/core/kernel_registry.h"
|
||||
template <typename T>
|
||||
void AvxCompute(const paddle::Tensor &x,
|
||||
const paddle::Tensor &weight,
|
||||
const paddle::Tensor &w_bias,
|
||||
bool trans,
|
||||
const std::string alog,
|
||||
paddle::Tensor &out,
|
||||
xft::Matrix<T> &quantizedWeight,
|
||||
xft::Vector<float> &WeightScale,
|
||||
xft::Vector<float> &WeightZero,
|
||||
xft::Vector<float> &WeightSum,
|
||||
MMHelper *mmHelper) {
|
||||
auto out_data = out.data<float>();
|
||||
const float *x_data = reinterpret_cast<const float *>(x.data<float>());
|
||||
const float *bias_data = nullptr;
|
||||
if (w_bias.initialized()) {
|
||||
bias_data = reinterpret_cast<const float *>(w_bias.data<float>());
|
||||
}
|
||||
int m = 1;
|
||||
for (int i = 0; i < x.shape().size() - 1; i++) {
|
||||
m = m * x.shape()[i];
|
||||
}
|
||||
int k = x.shape()[x.shape().size() - 1];
|
||||
int l = weight.shape()[1];
|
||||
int n = weight.shape()[1];
|
||||
if (w_bias.initialized()) {
|
||||
mmHelper->compute_bias(false,
|
||||
m,
|
||||
n,
|
||||
k,
|
||||
1.0f,
|
||||
x_data,
|
||||
k,
|
||||
quantizedWeight.Data(),
|
||||
WeightScale.Data(),
|
||||
WeightZero.Data(),
|
||||
WeightSum.Data(),
|
||||
0.0f,
|
||||
out_data,
|
||||
l,
|
||||
bias_data);
|
||||
} else {
|
||||
mmHelper->compute(false,
|
||||
m,
|
||||
n,
|
||||
k,
|
||||
1.0f,
|
||||
x_data,
|
||||
k,
|
||||
quantizedWeight.Data(),
|
||||
WeightScale.Data(),
|
||||
WeightZero.Data(),
|
||||
WeightSum.Data(),
|
||||
0.0,
|
||||
out_data,
|
||||
l);
|
||||
}
|
||||
};
|
||||
template <typename T>
|
||||
void AvxWeightOnly(const paddle::Tensor &x,
|
||||
const paddle::Tensor &weight,
|
||||
const paddle::Tensor &w_bias,
|
||||
bool trans,
|
||||
const std::string alog,
|
||||
paddle::Tensor &out) {
|
||||
static std::unordered_map<std::string,
|
||||
std::tuple<xft::Matrix<T> *,
|
||||
xft::Vector<float> *,
|
||||
xft::Vector<float> *,
|
||||
xft::Vector<float> *>>
|
||||
weight_only_hub;
|
||||
std::stringstream weights_addr;
|
||||
weights_addr << weight.data<float>() << alog;
|
||||
std::string weight_only_key = weights_addr.str();
|
||||
auto it_created = weight_only_hub.find(weight_only_key);
|
||||
static MMHelper *mmHelper;
|
||||
int rows = weight.shape()[0], cols = weight.shape()[1];
|
||||
xft::Vector<float> *WeightScale =
|
||||
new xft::Vector<float>(); // if weight is int8
|
||||
xft::Vector<float> *WeightZero =
|
||||
new xft::Vector<float>(); // if weight is int8
|
||||
xft::Vector<float> *WeightSum =
|
||||
new xft::Vector<float>(); // if weight is int8
|
||||
xft::Matrix<T> *quantizedWeight = new xft::Matrix<T>();
|
||||
if (it_created == weight_only_hub.end()) {
|
||||
auto weight_ptr = reinterpret_cast<const float *>(weight.data<float>());
|
||||
xft::Matrix<T> convertedWeight;
|
||||
mmHelper = new MMHelper(xft::DeviceKind::iCPU, 0);
|
||||
mmHelper->convertWeight(trans,
|
||||
rows,
|
||||
cols,
|
||||
weight_ptr,
|
||||
nullptr,
|
||||
nullptr,
|
||||
convertedWeight,
|
||||
*WeightScale,
|
||||
*WeightZero,
|
||||
*WeightSum);
|
||||
quantizedWeight->Resize(rows, cols);
|
||||
mmHelper->packWeight(trans, convertedWeight, *quantizedWeight);
|
||||
weight_only_hub[weight_only_key] = std::make_tuple(
|
||||
quantizedWeight, WeightScale, WeightZero, WeightSum);
|
||||
AvxCompute<T>(x,
|
||||
weight,
|
||||
w_bias,
|
||||
trans,
|
||||
alog,
|
||||
out,
|
||||
*quantizedWeight,
|
||||
*WeightScale,
|
||||
*WeightZero,
|
||||
*WeightSum,
|
||||
mmHelper);
|
||||
} else {
|
||||
AvxCompute<T>(x,
|
||||
weight,
|
||||
w_bias,
|
||||
trans,
|
||||
alog,
|
||||
out,
|
||||
*(std::get<0>(it_created->second)),
|
||||
*(std::get<1>(it_created->second)),
|
||||
*(std::get<2>(it_created->second)),
|
||||
*(std::get<3>(it_created->second)),
|
||||
mmHelper);
|
||||
}
|
||||
}
|
||||
std::vector<paddle::Tensor> InvokeAvxWeightOnly(const paddle::Tensor &x,
|
||||
const paddle::Tensor &weight,
|
||||
const paddle::Tensor &w_bias,
|
||||
const std::string &alog,
|
||||
bool trans) {
|
||||
auto out_shape = x.shape();
|
||||
out_shape[out_shape.size() - 1] = weight.shape()[1];
|
||||
auto out = paddle::empty(out_shape, x.dtype(), paddle::CPUPlace());
|
||||
if (alog == "int8") {
|
||||
AvxWeightOnly<int8_t>(x, weight, w_bias, trans, alog, out);
|
||||
} else if (alog == "fp16") {
|
||||
AvxWeightOnly<float16_t>(x, weight, w_bias, trans, alog, out);
|
||||
} else {
|
||||
AvxWeightOnly<float16_t>(x, weight, w_bias, trans, alog, out);
|
||||
}
|
||||
return {out};
|
||||
}
|
||||
|
||||
std::vector<std::vector<int64_t>> AvxWeightOnlyInferShape(
|
||||
std::vector<int64_t> x_shape,
|
||||
std::vector<int64_t> weigh_shape,
|
||||
std::vector<int64_t> weigh_bias_shape) {
|
||||
int m = 1;
|
||||
for (int i = 0; i < x_shape.size() - 1; i++) {
|
||||
m = m * x_shape[i];
|
||||
}
|
||||
return {std::vector<int64_t>{m, weigh_shape[1]}};
|
||||
}
|
||||
|
||||
std::vector<paddle::DataType> AvxWeightOnlyInferDtype(
|
||||
paddle::DataType x_dtype,
|
||||
paddle::DataType weight_dtype,
|
||||
paddle::DataType weight_bias_dtype) {
|
||||
return {x_dtype};
|
||||
}
|
||||
|
||||
PD_BUILD_STATIC_OP(avx_weight_only)
|
||||
.Inputs({"x", "weight", "w_bias"})
|
||||
.Outputs({"out"})
|
||||
.Attrs({"alog: std::string", "trans:bool"})
|
||||
.SetKernelFn(PD_KERNEL(InvokeAvxWeightOnly))
|
||||
.SetInferShapeFn(PD_INFER_SHAPE(AvxWeightOnlyInferShape))
|
||||
.SetInferDtypeFn(PD_INFER_DTYPE(AvxWeightOnlyInferDtype));
|
custom_ops/cpu_ops/rebuild_padding.cc (new file, 268 lines)
@@ -0,0 +1,268 @@
|
||||
// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include <vector>
|
||||
#include "paddle/extension.h"
|
||||
|
||||
#ifndef PD_BUILD_STATIC_OP
|
||||
#define PD_BUILD_STATIC_OP(name) PD_BUILD_OP(static_op_##name)
|
||||
#endif
|
||||
|
||||
template <typename T>
|
||||
void RebuildPaddingCPUImpl(T *output_data,
|
||||
const T *input_data,
|
||||
const int *cum_offsets_data,
|
||||
const int *seq_len_this_time_data,
|
||||
const int *seq_lens_decoder_data,
|
||||
const int *seq_lens_encoder_data,
|
||||
int max_input_length,
|
||||
int dim_embed,
|
||||
const int elem_nums) {
|
||||
for (int i = 0; i < elem_nums; ++i) {
|
||||
const int bi = i / dim_embed;
|
||||
const int bias_idx = i % dim_embed;
|
||||
int seq_id = 0;
|
||||
|
||||
if (seq_len_this_time_data[bi] == 0) {
|
||||
continue;
|
||||
}
|
||||
if (seq_lens_decoder_data[bi] == 0 && seq_lens_encoder_data[bi] == 0) {
|
||||
continue;
|
||||
}
|
||||
if (seq_lens_encoder_data[bi] > 0) {
|
||||
seq_id = seq_lens_encoder_data[bi] - 1;
|
||||
}
|
||||
const int ori_token_idx =
|
||||
bi * max_input_length - cum_offsets_data[bi] + seq_id;
|
||||
const int src_offset = ori_token_idx * dim_embed + bias_idx;
|
||||
|
||||
output_data[i] = input_data[src_offset];
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void RebuildAppendPaddingCPUImpl(T *output_data,
|
||||
const T *input_data,
|
||||
const int *cum_offsets_data,
|
||||
const int *seq_len_this_time_data,
|
||||
const int *seq_lens_decoder_data,
|
||||
const int *seq_lens_encoder_data,
|
||||
const int *output_padding_offset_data,
|
||||
const int max_input_length,
|
||||
const int dim_embed,
|
||||
const int64_t output_elem_nums) {
|
||||
for (int i = 0; i < output_elem_nums; ++i) {
|
||||
int out_token_id = i / dim_embed;
|
||||
int ori_token_id =
|
||||
out_token_id + output_padding_offset_data[out_token_id];
|
||||
int bi = ori_token_id / max_input_length;
|
||||
if (seq_len_this_time_data[bi] == 0 ||
|
||||
(seq_lens_decoder_data[bi] == 0 &&
|
||||
seq_lens_encoder_data[bi] == 0)) {
|
||||
continue;
|
||||
}
|
||||
int seq_id = 0;
|
||||
if (seq_lens_encoder_data[bi] > 0) {
|
||||
seq_id = seq_lens_encoder_data[bi] - 1;
|
||||
}
|
||||
int input_token_id = ori_token_id - cum_offsets_data[bi] + seq_id;
|
||||
int bias_idx = i % dim_embed;
|
||||
int src_offset = input_token_id * dim_embed + bias_idx;
|
||||
output_data[i] = input_data[src_offset];
|
||||
}
|
||||
}
|
||||
|
||||
std::vector<paddle::Tensor> RebuildPaddingCPU(
|
||||
const paddle::Tensor &tmp_out,
|
||||
const paddle::Tensor &cum_offsets,
|
||||
const paddle::Tensor &seq_len_this_time,
|
||||
const paddle::Tensor &seq_lens_decoder,
|
||||
const paddle::Tensor &seq_lens_encoder,
|
||||
const paddle::optional<paddle::Tensor> &output_padding_offset,
|
||||
int max_input_length) {
|
||||
auto tmp_out_cpu = tmp_out.copy_to(paddle::CPUPlace(), true);
|
||||
auto cum_offsets_cpu = cum_offsets.copy_to(paddle::CPUPlace(), true);
|
||||
auto seq_len_this_time_cpu =
|
||||
seq_len_this_time.copy_to(paddle::CPUPlace(), true);
|
||||
auto seq_lens_decoder_cpu =
|
||||
seq_lens_decoder.copy_to(paddle::CPUPlace(), true);
|
||||
auto seq_lens_encoder_cpu =
|
||||
seq_lens_encoder.copy_to(paddle::CPUPlace(), true);
|
||||
paddle::optional<paddle::Tensor> output_padding_offset_cpu;
|
||||
if (output_padding_offset) {
|
||||
output_padding_offset_cpu =
|
||||
output_padding_offset->copy_to(paddle::CPUPlace(), true);
|
||||
}
|
||||
|
||||
int token_num = tmp_out_cpu.shape()[0];
|
||||
int dim_embed = tmp_out_cpu.shape()[1];
|
||||
int bsz = cum_offsets_cpu.shape()[0];
|
||||
|
||||
paddle::Tensor out;
|
||||
if (output_padding_offset_cpu) {
|
||||
int need_delete_token_num = 0;
|
||||
for (int i = 0; i < bsz; ++i) {
|
||||
if (seq_lens_encoder_cpu.data<int>()[i] > 0) {
|
||||
need_delete_token_num +=
|
||||
seq_lens_encoder_cpu.data<int>()[i] - 1;
|
||||
}
|
||||
}
|
||||
int output_token_num = token_num - need_delete_token_num;
|
||||
out = paddle::full({output_token_num, dim_embed},
|
||||
0,
|
||||
tmp_out_cpu.dtype(),
|
||||
paddle::CPUPlace());
|
||||
} else {
|
||||
out = paddle::full(
|
||||
{bsz, dim_embed}, 0, tmp_out_cpu.dtype(), paddle::CPUPlace());
|
||||
}
|
||||
|
||||
const int *cum_offsets_data = cum_offsets_cpu.data<int>();
|
||||
const int *seq_len_this_time_data = seq_len_this_time_cpu.data<int>();
|
||||
const int *seq_lens_decoder_data = seq_lens_decoder_cpu.data<int>();
|
||||
const int *seq_lens_encoder_data = seq_lens_encoder_cpu.data<int>();
|
||||
int elem_nums = out.numel();
|
||||
|
||||
if (output_padding_offset_cpu) {
|
||||
const int *output_padding_offset_data =
|
||||
output_padding_offset_cpu->data<int>();
|
||||
switch (tmp_out_cpu.dtype()) {
|
||||
case paddle::DataType::FLOAT32:
|
||||
RebuildAppendPaddingCPUImpl<float>(out.data<float>(),
|
||||
tmp_out_cpu.data<float>(),
|
||||
cum_offsets_data,
|
||||
seq_len_this_time_data,
|
||||
seq_lens_decoder_data,
|
||||
seq_lens_encoder_data,
|
||||
output_padding_offset_data,
|
||||
max_input_length,
|
||||
dim_embed,
|
||||
elem_nums);
|
||||
break;
|
||||
case paddle::DataType::FLOAT16:
|
||||
RebuildAppendPaddingCPUImpl<paddle::float16>(
|
||||
out.data<paddle::float16>(),
|
||||
tmp_out_cpu.data<paddle::float16>(),
|
||||
cum_offsets_data,
|
||||
seq_len_this_time_data,
|
||||
seq_lens_decoder_data,
|
||||
seq_lens_encoder_data,
|
||||
output_padding_offset_data,
|
||||
max_input_length,
|
||||
dim_embed,
|
||||
elem_nums);
|
||||
break;
|
||||
case paddle::DataType::BFLOAT16:
|
||||
RebuildAppendPaddingCPUImpl<paddle::bfloat16>(
|
||||
out.data<paddle::bfloat16>(),
|
||||
tmp_out_cpu.data<paddle::bfloat16>(),
|
||||
cum_offsets_data,
|
||||
seq_len_this_time_data,
|
||||
seq_lens_decoder_data,
|
||||
seq_lens_encoder_data,
|
||||
output_padding_offset_data,
|
||||
max_input_length,
|
||||
dim_embed,
|
||||
elem_nums);
|
||||
break;
|
||||
default:
|
||||
PD_THROW(
|
||||
"Unsupported data type for rebuild_padding_cpu. "
|
||||
"Only float32, float16, and bfloat16 are supported.");
|
||||
}
|
||||
} else {
|
||||
switch (tmp_out_cpu.dtype()) {
|
||||
case paddle::DataType::FLOAT32:
|
||||
RebuildPaddingCPUImpl<float>(out.data<float>(),
|
||||
tmp_out_cpu.data<float>(),
|
||||
cum_offsets_data,
|
||||
seq_len_this_time_data,
|
||||
seq_lens_decoder_data,
|
||||
seq_lens_encoder_data,
|
||||
max_input_length,
|
||||
dim_embed,
|
||||
elem_nums);
|
||||
break;
|
||||
case paddle::DataType::FLOAT16:
|
||||
RebuildPaddingCPUImpl<paddle::float16>(
|
||||
out.data<paddle::float16>(),
|
||||
tmp_out_cpu.data<paddle::float16>(),
|
||||
cum_offsets_data,
|
||||
seq_len_this_time_data,
|
||||
seq_lens_decoder_data,
|
||||
seq_lens_encoder_data,
|
||||
max_input_length,
|
||||
dim_embed,
|
||||
elem_nums);
|
||||
break;
|
||||
case paddle::DataType::BFLOAT16:
|
||||
|
||||
RebuildPaddingCPUImpl<paddle::bfloat16>(
|
||||
out.data<paddle::bfloat16>(),
|
||||
tmp_out_cpu.data<paddle::bfloat16>(),
|
||||
cum_offsets_data,
|
||||
seq_len_this_time_data,
|
||||
seq_lens_decoder_data,
|
||||
seq_lens_encoder_data,
|
||||
max_input_length,
|
||||
dim_embed,
|
||||
elem_nums);
|
||||
break;
|
||||
default:
|
||||
PD_THROW(
|
||||
"Unsupported data type for rebuild_padding_cpu. "
|
||||
"Only float32, float16, and bfloat16 are supported.");
|
||||
}
|
||||
}
|
||||
return {out};
|
||||
}
|
||||
|
||||
std::vector<std::vector<int64_t>> RebuildPaddingInferShape(
|
||||
const std::vector<int64_t> &tmp_out_shape,
|
||||
const std::vector<int64_t> &cum_offsets_shape,
|
||||
const std::vector<int64_t> &seq_len_this_time_shape,
|
||||
const std::vector<int64_t> &seq_lens_decoder_shape,
|
||||
const std::vector<int64_t> &seq_lens_encoder_shape,
|
||||
const paddle::optional<std::vector<int64_t>> &output_padding_offset_shape) {
|
||||
int64_t dim_embed = tmp_out_shape[1];
|
||||
if (output_padding_offset_shape) {
|
||||
return {{-1, dim_embed}};
|
||||
} else {
|
||||
int64_t bsz = cum_offsets_shape[0];
|
||||
return {{bsz, dim_embed}};
|
||||
}
|
||||
}
|
||||
|
||||
std::vector<paddle::DataType> RebuildPaddingInferDtype(
|
||||
const paddle::DataType &tmp_out_dtype,
|
||||
const paddle::DataType &cum_offsets_dtype,
|
||||
const paddle::DataType &seq_len_this_time_dtype,
|
||||
const paddle::DataType &seq_lens_decoder_dtype,
|
||||
const paddle::DataType &seq_lens_encoder_dtype,
|
||||
const paddle::optional<paddle::DataType> &output_padding_offset_dtype) {
|
||||
return {tmp_out_dtype};
|
||||
}
|
||||
|
||||
PD_BUILD_STATIC_OP(rebuild_padding_cpu)
|
||||
.Inputs({"tmp_out",
|
||||
"cum_offsets",
|
||||
"seq_len_this_time",
|
||||
"seq_lens_decoder",
|
||||
"seq_lens_encoder",
|
||||
paddle::Optional("output_padding_offset")})
|
||||
.Outputs({"out"})
|
||||
.Attrs({"max_input_length: int"})
|
||||
.SetKernelFn(PD_KERNEL(RebuildPaddingCPU))
|
||||
.SetInferShapeFn(PD_INFER_SHAPE(RebuildPaddingInferShape))
|
||||
.SetInferDtypeFn(PD_INFER_DTYPE(RebuildPaddingInferDtype));
|
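For reference, the gather performed by `rebuild_padding_cpu` above (the branch without `output_padding_offset`) can be restated in a few lines of NumPy: each active sequence copies the hidden state of its last prefilled token, or its single decoded token, out of the packed `tmp_out` buffer into a `[bsz, dim_embed]` output. This is an illustration of the indexing, not an exported reference implementation.

```python
import numpy as np

def rebuild_padding_ref(tmp_out, cum_offsets, seq_len_this_time,
                        seq_lens_decoder, seq_lens_encoder, max_input_length):
    bsz, dim_embed = cum_offsets.shape[0], tmp_out.shape[1]
    out = np.zeros((bsz, dim_embed), dtype=tmp_out.dtype)
    for bi in range(bsz):
        if seq_len_this_time[bi] == 0:
            continue
        if seq_lens_decoder[bi] == 0 and seq_lens_encoder[bi] == 0:
            continue
        # last token of the prompt during prefill, the single new token otherwise
        seq_id = seq_lens_encoder[bi] - 1 if seq_lens_encoder[bi] > 0 else 0
        token_idx = bi * max_input_length - cum_offsets[bi] + seq_id
        out[bi] = tmp_out[token_idx]
    return out
```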
@@ -1,201 +0,0 @@
|
||||
// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "layers_decoder.h"
|
||||
#include "paddle/extension.h"
|
||||
#include "paddle/phi/core/kernel_registry.h"
|
||||
|
||||
std::vector<paddle::Tensor> InvokeAllLLaMALayer(
|
||||
const paddle::Tensor &input,
|
||||
const std::vector<paddle::Tensor> &ln1Gamma,
|
||||
const std::vector<paddle::Tensor> &ln1Beta,
|
||||
const std::vector<paddle::Tensor> &qkvWeight,
|
||||
const std::vector<paddle::Tensor> &qkvBiasWeight,
|
||||
const std::vector<paddle::Tensor> &attnOutWeight,
|
||||
const std::vector<paddle::Tensor> &attnOutBias,
|
||||
const std::vector<paddle::Tensor> &ln2Gamma,
|
||||
const std::vector<paddle::Tensor> &ln2Beta,
|
||||
const std::vector<paddle::Tensor> &gateWeight,
|
||||
const std::vector<paddle::Tensor> &gateBias,
|
||||
const std::vector<paddle::Tensor> &upWeight,
|
||||
const std::vector<paddle::Tensor> &upBias,
|
||||
const std::vector<paddle::Tensor> &downWeight,
|
||||
const std::vector<paddle::Tensor> &downBias,
|
||||
const paddle::Tensor &pastSeqLen,
|
||||
const paddle::Tensor ¤tSeqLen,
|
||||
const paddle::Tensor &step,
|
||||
int hiddensize,
|
||||
int totalLayer,
|
||||
const std::string &computeType,
|
||||
const std::string &activation,
|
||||
const std::string &normType,
|
||||
int attHeadDim,
|
||||
int attHeadNum,
|
||||
int kvHeadNum,
|
||||
int maxPositions,
|
||||
int maxPosEmbed,
|
||||
int intermediateSize) {
|
||||
auto out = paddle::empty_like(input);
|
||||
auto batchSize = input.shape()[0];
|
||||
auto inputSeqLen = input.shape()[1];
|
||||
auto past_seq_len = pastSeqLen.data<int64_t>()[0];
|
||||
auto cur_seq_len = static_cast<int64_t>(currentSeqLen.data<int32_t>()[0]);
|
||||
auto step_id = step.data<int64_t>()[0];
|
||||
auto output_ptr = reinterpret_cast<void *>(out.data<float>());
|
||||
auto xft_data_type = xft::DataType::fp16;
|
||||
if (computeType == "bf16") {
|
||||
xft_data_type = xft::DataType::bf16;
|
||||
} else if (computeType == "bf16_int8") {
|
||||
xft_data_type = xft::DataType::bf16_int8;
|
||||
}
|
||||
auto xft_act_type = xft::ActivationType::SILU;
|
||||
if (activation == "relu") {
|
||||
xft_act_type = xft::ActivationType::RELU;
|
||||
} else if (activation == "gelu") {
|
||||
xft_act_type = xft::ActivationType::GELU;
|
||||
} else if (activation == "swiglu") {
|
||||
xft_act_type = xft::ActivationType::SWIGLU;
|
||||
}
|
||||
auto xft_norm_type = xft::NormType::RMS;
|
||||
if (normType == "layernorm") {
|
||||
xft_norm_type = xft::NormType::LN;
|
||||
}
|
||||
auto input_ptr = reinterpret_cast<const void *>(input.data<float>());
|
||||
for (int i = 0; i < totalLayer; ++i) {
|
||||
auto ln1Gamma_ptr =
|
||||
reinterpret_cast<const float *>(ln1Gamma[i].data<float>());
|
||||
auto ln1Beta_ptr =
|
||||
reinterpret_cast<const float *>(ln1Beta[i].data<float>());
|
||||
auto qkvWeight_ptr =
|
||||
reinterpret_cast<const void *>(qkvWeight[i].data<float>());
|
||||
auto qkvBiasWeight_ptr =
|
||||
reinterpret_cast<const float *>(qkvBiasWeight[i].data<float>());
|
||||
auto attnOutWeight_ptr =
|
||||
reinterpret_cast<const void *>(attnOutWeight[i].data<float>());
|
||||
auto ln2Gamma_ptr =
|
||||
reinterpret_cast<const float *>(ln2Gamma[i].data<float>());
|
||||
auto ln2Beta_ptr =
|
||||
reinterpret_cast<const float *>(ln2Beta[i].data<float>());
|
||||
auto gate_weight_ptr =
|
||||
reinterpret_cast<const void *>(gateWeight[i].data<float>());
|
||||
auto up_weight_ptr =
|
||||
reinterpret_cast<const void *>(upWeight[i].data<float>());
|
||||
auto down_weight_ptr =
|
||||
reinterpret_cast<const void *>(downWeight[i].data<float>());
|
||||
auto gate_bias_ptr =
|
||||
reinterpret_cast<const float *>(gateBias[i].data<float>());
|
||||
auto up_bias_ptr =
|
||||
reinterpret_cast<const float *>(upBias[i].data<float>());
|
||||
auto down_bias_ptr =
|
||||
reinterpret_cast<const float *>(downBias[i].data<float>());
|
||||
auto attnOutBias_ptr =
|
||||
reinterpret_cast<const float *>(attnOutBias[i].data<float>());
|
||||
invokeLayerLLaMA(
|
||||
xft_data_type, // dt
|
||||
xft_act_type, // at
|
||||
xft_norm_type, // nt
|
||||
i, // layerId
|
||||
totalLayer, // totalLayers
|
||||
batchSize, // batchSize
|
||||
inputSeqLen, // inputSeqLen
|
||||
attHeadDim, // attHeadDim
|
||||
attHeadNum, // attHeadNum
|
||||
kvHeadNum, // kvHeadNum
|
||||
maxPositions, // maxPositions
|
||||
maxPosEmbed, // maxPosEmbed
|
||||
past_seq_len, // pastSeqLen
|
||||
cur_seq_len, // currentSeqLen
|
||||
step_id, // step
|
||||
hiddensize, // hiddenSize
|
||||
intermediateSize, // intermediateSize
|
||||
reinterpret_cast<void *>(output_ptr), // output
|
||||
hiddensize, // outputStride
|
||||
input_ptr, // input
|
||||
hiddensize, // inputStride
|
||||
ln1Gamma_ptr, // ln1Gamma
|
||||
ln1Beta_ptr, // ln1Beta
|
||||
qkvWeight_ptr, // queryWeight
|
||||
qkvWeight_ptr + hiddensize, // keyWeight
|
||||
qkvWeight_ptr + hiddensize + kvHeadNum * attHeadDim, // valueWeight
|
||||
attnOutWeight_ptr, // attnOutWeight
|
||||
ln2Gamma_ptr, // ln2Gamma
|
||||
ln2Beta_ptr, // ln2Beta
|
||||
gate_weight_ptr,
|
||||
up_weight_ptr,
|
||||
down_weight_ptr,
|
||||
qkvBiasWeight_ptr, // queryBias
|
||||
qkvBiasWeight_ptr + hiddensize, // keyBias
|
||||
qkvBiasWeight_ptr + hiddensize +
|
||||
kvHeadNum * attHeadDim, // valueBias
|
||||
attnOutBias_ptr, // attnOutBias
|
||||
qkvWeight_ptr, // myqkvWeight
|
||||
gate_bias_ptr,
|
||||
up_bias_ptr,
|
||||
down_bias_ptr,
|
||||
qkvBiasWeight_ptr);
|
||||
if (i < totalLayer - 1) {
|
||||
memcpy(const_cast<void *>(input_ptr),
|
||||
output_ptr,
|
||||
batchSize * inputSeqLen * hiddensize * sizeof(float));
|
||||
}
|
||||
}
|
||||
return {out};
|
||||
}
|
||||
|
||||
std::vector<std::vector<int64_t>> AllLLaMALayerInferShape(
|
||||
std::vector<int64_t> x_shape) {
|
||||
return {x_shape};
|
||||
}
|
||||
|
||||
std::vector<paddle::DataType> AllLLaMALayerInferDtype(
|
||||
paddle::DataType x_dtype) {
|
||||
return {x_dtype};
|
||||
}
|
||||
|
||||
PD_BUILD_STATIC_OP(xft_llama_all_layer)
|
||||
.Inputs({
|
||||
"x",
|
||||
paddle::Vec("ln1Gamma"),
|
||||
paddle::Vec("ln1Beta"),
|
||||
paddle::Vec("qkvWeight"),
|
||||
paddle::Vec("qkvBiasWeight"),
|
||||
paddle::Vec("attnOutWeight"),
|
||||
paddle::Vec("attnOutBias"),
|
||||
paddle::Vec("ln2Gamma"),
|
||||
paddle::Vec("ln2Beta"),
|
||||
paddle::Vec("gateWeight"),
|
||||
paddle::Vec("gateBias"),
|
||||
paddle::Vec("upWeight"),
|
||||
paddle::Vec("upBias"),
|
||||
paddle::Vec("downWeight"),
|
||||
paddle::Vec("downBias"),
|
||||
"pastSeqLen",
|
||||
"currentSeqLen",
|
||||
"step",
|
||||
})
|
||||
.Outputs({"out"})
|
||||
.Attrs({"hiddensize :int",
|
||||
"totalLayer :int",
|
||||
"computeType : std::string",
|
||||
"activation :std::string",
|
||||
"normType :std::string",
|
||||
"attHeadDim: int",
|
||||
"attHeadNum: int",
|
||||
"kvHeadNum: int",
|
||||
"maxPositions: int",
|
||||
"maxPosEmbed: int",
|
||||
"intermediateSize: int"})
|
||||
.SetKernelFn(PD_KERNEL(InvokeAllLLaMALayer))
|
||||
.SetInferShapeFn(PD_INFER_SHAPE(AllLLaMALayerInferShape))
|
||||
.SetInferDtypeFn(PD_INFER_DTYPE(AllLLaMALayerInferDtype));
|
@@ -1,126 +0,0 @@
|
||||
// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
#include <omp.h>
|
||||
#include <cstdio>
|
||||
#include <iostream>
|
||||
#include "paddle/extension.h"
|
||||
|
||||
void greedy_search(const float *probs,
|
||||
int64_t *next_token_ids,
|
||||
int bsz,
|
||||
int vocab_size) {
|
||||
int numThreads = 0;
|
||||
#pragma omp parallel
|
||||
{
|
||||
int tid = omp_get_thread_num();
|
||||
if (tid == 0) {
|
||||
numThreads = omp_get_num_threads();
|
||||
}
|
||||
}
|
||||
float maxVals[bsz];
|
||||
|
||||
// Small batch size (each sample can have at least 2 threads)
|
||||
if (numThreads / bsz >= 2) {
|
||||
int thrPerSample = numThreads / bsz;
|
||||
int sizePerThr = (vocab_size + thrPerSample - 1) / thrPerSample;
|
||||
int maxIndices[bsz * thrPerSample];
|
||||
float maxValues[bsz * thrPerSample];
|
||||
|
||||
// TODO: if size is small, possible to cause out of boundary
|
||||
#pragma omp parallel for collapse(2)
|
||||
for (int b = 0; b < bsz; ++b) {
|
||||
for (int t = 0; t < thrPerSample; ++t) {
|
||||
int start = t * sizePerThr;
|
||||
int end = (start + sizePerThr) > vocab_size
|
||||
? vocab_size
|
||||
: (start + sizePerThr);
|
||||
const float *p = probs + b * vocab_size;
|
||||
int maxIdx = start;
|
||||
float maxVal = p[start];
|
||||
for (int off = start + 1; off < end; ++off) {
|
||||
if (p[off] > maxVal) {
|
||||
maxVal = p[off];
|
||||
maxIdx = off;
|
||||
}
|
||||
}
|
||||
|
||||
// False sharing happens, but since only one time, not avoided
|
||||
maxIndices[b * thrPerSample + t] = maxIdx;
|
||||
maxValues[b * thrPerSample + t] = maxVal;
|
||||
}
|
||||
}
|
||||
|
||||
// Local reduction
|
||||
for (int i = 0; i < bsz; ++i) {
|
||||
int *pIndices = maxIndices + i * thrPerSample;
|
||||
float *pValues = maxValues + i * thrPerSample;
|
||||
int maxIdx = pIndices[0];
|
||||
float maxVal = pValues[0];
|
||||
for (int j = 1; j < thrPerSample; ++j) {
|
||||
if (pValues[j] > maxVal) {
|
||||
maxVal = pValues[j];
|
||||
maxIdx = pIndices[j];
|
||||
}
|
||||
}
|
||||
next_token_ids[i] = maxIdx;
|
||||
maxVals[i] = maxVal;
|
||||
}
|
||||
}
|
||||
|
||||
// Each thread handle one sample (one row)
|
||||
else {
|
||||
#pragma omp parallel for
|
||||
for (int i = 0; i < bsz; ++i) {
|
||||
int maxId = 0;
|
||||
const float *p = probs + i * vocab_size;
|
||||
float maxVal = p[0];
|
||||
for (int j = 1; j < vocab_size; ++j) {
|
||||
if (p[j] > maxVal) {
|
||||
maxVal = p[j];
|
||||
maxId = j;
|
||||
}
|
||||
}
|
||||
next_token_ids[i] = maxId;
|
||||
maxVals[i] = maxVal;
|
||||
}
|
||||
}
|
||||
return;
|
||||
}
|
||||
std::vector<paddle::Tensor> XftGreedySearch(const paddle::Tensor &probs) {
|
||||
const int bsz = probs.shape()[0];
|
||||
const int vocab_size = probs.shape()[1];
|
||||
auto next_tokens =
|
||||
paddle::empty({bsz, 1}, paddle::DataType::INT64, probs.place());
|
||||
|
||||
greedy_search(probs.data<float>(),
|
||||
const_cast<int64_t *>(next_tokens.data<int64_t>()),
|
||||
bsz,
|
||||
vocab_size);
|
||||
return {next_tokens};
|
||||
}
|
||||
std::vector<std::vector<int64_t>> XftGreedySearchInferShape(
|
||||
const std::vector<int64_t> &probs_shape) {
|
||||
int64_t bsz = probs_shape[0];
|
||||
return {{bsz, 1}};
|
||||
}
|
||||
std::vector<paddle::DataType> XftGreedySearchInferDtype(
|
||||
const paddle::DataType &probs_dtype) {
|
||||
return {paddle::DataType::INT64};
|
||||
}
|
||||
PD_BUILD_STATIC_OP(xft_greedy_search)
|
||||
.Inputs({"probs"})
|
||||
.Outputs({"next_tokens_ids"})
|
||||
.SetInferShapeFn(PD_INFER_SHAPE(XftGreedySearchInferShape))
|
||||
.SetInferDtypeFn(PD_INFER_DTYPE(XftGreedySearchInferDtype))
|
||||
.SetKernelFn(PD_KERNEL(XftGreedySearch));
|
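The deleted `xft_greedy_search` op is an OpenMP-parallel argmax over the vocabulary dimension. Functionally it reduces to a per-row argmax, which in Paddle is a one-liner; the sketch below is an equivalent formulation for illustration, not the removed kernel itself.

```python
import paddle

def greedy_search(probs: paddle.Tensor) -> paddle.Tensor:
    """Pick the highest-probability token id per row of a [bsz, vocab_size] tensor."""
    return paddle.argmax(probs, axis=-1, keepdim=True)  # int64, shape [bsz, 1]
```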
File diff suppressed because it is too large
@@ -17,15 +17,12 @@
|
||||
#include "paddle/phi/core/memory/memcpy.h"
|
||||
|
||||
template <int THREADBLOCK_SIZE>
|
||||
__global__ void GetMaxLenKernel(const int *seq_lens,
|
||||
const int *seq_lens_this_time,
|
||||
const int *seq_lens_encoder,
|
||||
const int *seq_lens_this_time_merged,
|
||||
const int *seq_lens_encoder_merged,
|
||||
const int *seq_mapping,
|
||||
const int *system_lens,
|
||||
int *max_lens,
|
||||
const int batch_size) {
|
||||
__global__ void
|
||||
GetMaxLenKernel(const int *seq_lens, const int *seq_lens_this_time,
|
||||
const int *seq_lens_encoder,
|
||||
const int *seq_lens_this_time_merged,
|
||||
const int *seq_lens_encoder_merged, const int *seq_mapping,
|
||||
const int *system_lens, int *max_lens, const int batch_size) {
|
||||
const int tid = threadIdx.x;
|
||||
|
||||
typedef cub::BlockReduce<int, THREADBLOCK_SIZE> BlockReduce;
|
||||
@@ -41,43 +38,61 @@ __global__ void GetMaxLenKernel(const int *seq_lens,
|
||||
int max_dec_len_without_system_this_thread = 0;
|
||||
for (int i = tid; i < batch_size; i += blockDim.x) {
|
||||
const int seq_len_this_time = seq_lens_this_time[i];
|
||||
max_len_this_time_this_thread = max(seq_len_this_time,
|
||||
max_len_this_time_this_thread);
|
||||
max_len_encoder_this_thread = max(seq_lens_encoder[i],
|
||||
max_len_encoder_this_thread);
|
||||
max_len_this_time_this_thread =
|
||||
max(seq_len_this_time, max_len_this_time_this_thread);
|
||||
max_len_encoder_this_thread =
|
||||
max(seq_lens_encoder[i], max_len_encoder_this_thread);
|
||||
max_len_decoder_this_thread = max(seq_lens[i], max_len_decoder_this_thread);
|
||||
if (seq_len_this_time <= 0) continue;
|
||||
if (seq_len_this_time <= 0)
|
||||
continue;
|
||||
const int max_just_dec_len_now = seq_lens_encoder[i] > 0 ? 0 : seq_lens[i];
|
||||
max_len_this_thread = max(seq_lens[i] + seq_len_this_time,
|
||||
max_len_this_thread);
|
||||
max_just_dec_len_this_thread = max(max_just_dec_len_this_thread,
|
||||
max_just_dec_len_now);
|
||||
max_len_this_thread =
|
||||
max(seq_lens[i] + seq_len_this_time, max_len_this_thread);
|
||||
max_just_dec_len_this_thread =
|
||||
max(max_just_dec_len_this_thread, max_just_dec_len_now);
|
||||
if (system_lens) {
|
||||
const int real_bid = seq_mapping[i];
|
||||
const int system_len_now = system_lens[real_bid];
|
||||
max_system_len_this_thread = max(max_system_len_this_thread, system_len_now);
|
||||
max_dec_len_without_system_this_thread = max(max_dec_len_without_system_this_thread,
|
||||
max_just_dec_len_now - system_len_now);
|
||||
max_system_len_this_thread =
|
||||
max(max_system_len_this_thread, system_len_now);
|
||||
max_dec_len_without_system_this_thread =
|
||||
max(max_dec_len_without_system_this_thread,
|
||||
max_just_dec_len_now - system_len_now);
|
||||
}
|
||||
}
|
||||
if (system_lens) {
|
||||
for (int i = tid; i < batch_size; i += blockDim.x) {
|
||||
const int ori_seq_len_this_time = seq_lens_this_time_merged[i];
|
||||
if (ori_seq_len_this_time <= 0) continue;
|
||||
const int max_just_dec_merged_len_this_time_now = seq_lens_encoder_merged[i] > 0 ?
|
||||
0 : ori_seq_len_this_time;
|
||||
max_just_dec_merged_len_this_time_this_thread = max(max_just_dec_merged_len_this_time_this_thread,
|
||||
max_just_dec_merged_len_this_time_now);
|
||||
if (ori_seq_len_this_time <= 0)
|
||||
continue;
|
||||
const int max_just_dec_merged_len_this_time_now =
|
||||
seq_lens_encoder_merged[i] > 0 ? 0 : ori_seq_len_this_time;
|
||||
max_just_dec_merged_len_this_time_this_thread =
|
||||
max(max_just_dec_merged_len_this_time_this_thread,
|
||||
max_just_dec_merged_len_this_time_now);
|
||||
}
|
||||
}
|
||||
int total_max_len_this_time = BlockReduce(temp_storage).Reduce(max_len_this_time_this_thread, MaxOp<int>());
|
||||
int total_max_len_encoder = BlockReduce(temp_storage).Reduce(max_len_encoder_this_thread, MaxOp<int>());
|
||||
int total_max_len_decoder = BlockReduce(temp_storage).Reduce(max_len_decoder_this_thread, MaxOp<int>());
|
||||
int total = BlockReduce(temp_storage).Reduce(max_len_this_thread, MaxOp<int>());
|
||||
int total_just_dec = BlockReduce(temp_storage).Reduce(max_just_dec_len_this_thread, MaxOp<int>());
|
||||
int total_just_dec_merged = BlockReduce(temp_storage).Reduce(max_just_dec_merged_len_this_time_this_thread, MaxOp<int>());
|
||||
int total_system_len = BlockReduce(temp_storage).Reduce(max_system_len_this_thread, MaxOp<int>());
|
||||
int total_dec_len_without_system = BlockReduce(temp_storage).Reduce(max_dec_len_without_system_this_thread, MaxOp<int>());
|
||||
int total_max_len_this_time =
|
||||
BlockReduce(temp_storage)
|
||||
.Reduce(max_len_this_time_this_thread, MaxOp<int>());
|
||||
int total_max_len_encoder =
|
||||
BlockReduce(temp_storage)
|
||||
.Reduce(max_len_encoder_this_thread, MaxOp<int>());
|
||||
int total_max_len_decoder =
|
||||
BlockReduce(temp_storage)
|
||||
.Reduce(max_len_decoder_this_thread, MaxOp<int>());
|
||||
int total =
|
||||
BlockReduce(temp_storage).Reduce(max_len_this_thread, MaxOp<int>());
|
||||
int total_just_dec = BlockReduce(temp_storage)
|
||||
.Reduce(max_just_dec_len_this_thread, MaxOp<int>());
|
||||
int total_just_dec_merged =
|
||||
BlockReduce(temp_storage)
|
||||
.Reduce(max_just_dec_merged_len_this_time_this_thread, MaxOp<int>());
|
||||
int total_system_len = BlockReduce(temp_storage)
|
||||
.Reduce(max_system_len_this_thread, MaxOp<int>());
|
||||
int total_dec_len_without_system =
|
||||
BlockReduce(temp_storage)
|
||||
.Reduce(max_dec_len_without_system_this_thread, MaxOp<int>());
|
||||
if (tid == 0) {
|
||||
max_lens[0] = total_max_len_this_time;
|
||||
max_lens[1] = total_max_len_encoder;
|
||||
@@ -90,30 +105,22 @@ __global__ void GetMaxLenKernel(const int *seq_lens,
|
||||
}
|
||||
}
|
||||
|
||||
void GetMaxLen(const paddle::Tensor& seq_lens_tensor,
|
||||
const paddle::Tensor& seq_lens_this_time,
|
||||
const paddle::Tensor& seq_lens_encoder,
|
||||
paddle::Tensor &max_len_tensor,
|
||||
const int batch_size) {
|
||||
void GetMaxLen(const paddle::Tensor &seq_lens_tensor,
|
||||
const paddle::Tensor &seq_lens_this_time,
|
||||
const paddle::Tensor &seq_lens_encoder,
|
||||
paddle::Tensor &max_len_tensor, const int batch_size) {
|
||||
constexpr int blockSize = 1024;
|
||||
GetMaxLenKernel<blockSize><<<1, blockSize, 0, seq_lens_encoder.stream()>>>(
|
||||
seq_lens_tensor.data<int>(),
|
||||
seq_lens_this_time.data<int>(),
|
||||
seq_lens_encoder.data<int>(),
|
||||
nullptr,
|
||||
nullptr,
|
||||
nullptr,
|
||||
nullptr,
|
||||
max_len_tensor.data<int>(),
|
||||
batch_size);
|
||||
seq_lens_tensor.data<int>(), seq_lens_this_time.data<int>(),
|
||||
seq_lens_encoder.data<int>(), nullptr, nullptr, nullptr, nullptr,
|
||||
max_len_tensor.data<int>(), batch_size);
|
||||
}
|
||||
|
||||
__global__ void split_q_block(const int* __restrict__ seq_lens_q,
|
||||
const int* __restrict__ seq_lens_encoder,
|
||||
int* __restrict__ batch_ids,
|
||||
int* __restrict__ tile_ids_per_batch,
|
||||
int* __restrict__ num_blocks_x,
|
||||
const int bsz,
|
||||
__global__ void split_q_block(const int *__restrict__ seq_lens_q,
|
||||
const int *__restrict__ seq_lens_encoder,
|
||||
int *__restrict__ batch_ids,
|
||||
int *__restrict__ tile_ids_per_batch,
|
||||
int *__restrict__ num_blocks_x, const int bsz,
|
||||
const int num_rows_per_block,
|
||||
const int group_size) {
|
||||
if (threadIdx.x == 0) {
|
||||
@@ -124,8 +131,7 @@ __global__ void split_q_block(const int* __restrict__ seq_lens_q,
|
||||
if (seq_lens_encoder && seq_lens_encoder[bid] > 0) {
|
||||
seq_len = 0;
|
||||
}
|
||||
const int loop_times =
|
||||
div_up(seq_len * group_size, num_rows_per_block);
|
||||
const int loop_times = div_up(seq_len * group_size, num_rows_per_block);
|
||||
for (uint32_t tile_id = 0; tile_id < loop_times; tile_id++) {
|
||||
batch_ids[index] = bid;
|
||||
tile_ids_per_batch[index++] = tile_id;
|
||||
@@ -136,14 +142,12 @@ __global__ void split_q_block(const int* __restrict__ seq_lens_q,
|
||||
}
|
||||
}
|
||||
|
||||
__global__ void split_kv_block(const int* __restrict__ seq_lens_decoder,
|
||||
const int* __restrict__ seq_lens_encoder,
|
||||
int* __restrict__ batch_ids,
|
||||
int* __restrict__ tile_ids_per_batch,
|
||||
int* __restrict__ num_blocks_x,
|
||||
const int bsz,
|
||||
const int pad_len,
|
||||
const int num_row_per_block) {
|
||||
__global__ void split_kv_block(const int *__restrict__ seq_lens_decoder,
|
||||
const int *__restrict__ seq_lens_encoder,
|
||||
int *__restrict__ batch_ids,
|
||||
int *__restrict__ tile_ids_per_batch,
|
||||
int *__restrict__ num_blocks_x, const int bsz,
|
||||
const int pad_len, const int num_row_per_block) {
|
||||
if (threadIdx.x == 0) {
|
||||
int gridx = 0;
|
||||
int index = 0;
|
||||
@@ -165,50 +169,46 @@ __global__ void split_kv_block(const int* __restrict__ seq_lens_decoder,
|
||||
}
|
||||
|
||||
template <int THREADBLOCK_SIZE>
|
||||
__global__ void get_max_len_kv_ernel(int* max_seq_lens_out,
|
||||
const int* seq_lens_this_time,
|
||||
const int* seq_lens_decoder,
|
||||
const int batch_size) {
|
||||
__global__ void
|
||||
get_max_len_kv_ernel(int *max_seq_lens_out, const int *seq_lens_this_time,
|
||||
const int *seq_lens_decoder, const int batch_size) {
|
||||
const int tid = threadIdx.x;
|
||||
|
||||
|
||||
typedef cub::BlockReduce<int, THREADBLOCK_SIZE> BlockReduce;
|
||||
__shared__ typename BlockReduce::TempStorage temp_storage;
|
||||
|
||||
int max_len_this_thread = 0;
|
||||
for (int i = tid; i < batch_size; i += blockDim.x) {
|
||||
if (seq_lens_decoder[i] == 0) continue;
|
||||
max_len_this_thread = max(seq_lens_this_time[i] + seq_lens_decoder[i], max_len_this_thread);
|
||||
if (seq_lens_decoder[i] == 0)
|
||||
continue;
|
||||
max_len_this_thread =
|
||||
max(seq_lens_this_time[i] + seq_lens_decoder[i], max_len_this_thread);
|
||||
}
|
||||
int total = BlockReduce(temp_storage).Reduce(max_len_this_thread, MaxOp<int>());
|
||||
int total =
|
||||
BlockReduce(temp_storage).Reduce(max_len_this_thread, MaxOp<int>());
|
||||
if (tid == 0) {
|
||||
*max_seq_lens_out = total;
|
||||
}
|
||||
}
|
||||
|
||||
std::vector<paddle::Tensor> GetBlockShapeAndSplitKVBlock(
|
||||
const paddle::Tensor& seq_lens_encoder,
|
||||
const paddle::Tensor& seq_lens_decoder,
|
||||
const paddle::Tensor& seq_lens_this_time,
|
||||
const paddle::Tensor& cum_offsets,
|
||||
const int encoder_block_shape_q,
|
||||
const int decoder_block_shape_q,
|
||||
const int group_size,
|
||||
const int block_size,
|
||||
const paddle::Tensor &seq_lens_encoder,
|
||||
const paddle::Tensor &seq_lens_decoder,
|
||||
const paddle::Tensor &seq_lens_this_time, const paddle::Tensor &cum_offsets,
|
||||
const int encoder_block_shape_q, const int decoder_block_shape_q,
|
||||
const int group_size, const int block_size,
|
||||
const int decoder_step_token_num) {
|
||||
auto stream = seq_lens_encoder.stream();
|
||||
int bsz = cum_offsets.shape()[0];
|
||||
auto max_len_tensor =
|
||||
GetEmptyTensor({8}, paddle::DataType::INT32, seq_lens_encoder.place());
|
||||
GetMaxLen(
|
||||
seq_lens_decoder,
|
||||
seq_lens_this_time,
|
||||
seq_lens_encoder,
|
||||
max_len_tensor,
|
||||
bsz);
|
||||
GetMaxLen(seq_lens_decoder, seq_lens_this_time, seq_lens_encoder,
|
||||
max_len_tensor, bsz);
|
||||
|
||||
// max_len_this_time, max_enc_len_this_time, max_dec_len_this_time, max_enc_dec_len_this_time,
|
||||
// max_just_dec_len_this_time, max_just_dec_merged_len_this_time, max_system_len, max_just_dec_len_without_system
|
||||
// max_len_this_time, max_enc_len_this_time, max_dec_len_this_time,
|
||||
// max_enc_dec_len_this_time, max_just_dec_len_this_time,
|
||||
// max_just_dec_merged_len_this_time, max_system_len,
|
||||
// max_just_dec_len_without_system
|
||||
auto max_len_cpu = max_len_tensor.copy_to(paddle::CPUPlace(), false);
|
||||
auto max_len_cpu_ptr = max_len_cpu.data<int>();
|
||||
int max_len_this_time = max_len_cpu_ptr[0];
|
||||
@@ -229,67 +229,67 @@ std::vector<paddle::Tensor> GetBlockShapeAndSplitKVBlock(
|
||||
paddle::Tensor decoder_batch_ids;
|
||||
paddle::Tensor decoder_tile_ids_per_batch;
|
||||
paddle::Tensor decoder_num_blocks_x_cpu; /*cpu*/
|
||||
paddle::Tensor max_len_kv_cpu; /*cpu*/
|
||||
paddle::Tensor max_len_kv_cpu; /*cpu*/
|
||||
|
||||
auto max_len_kv =
|
||||
GetEmptyTensor({1}, paddle::DataType::INT32, seq_lens_decoder.place());
|
||||
get_max_len_kv_ernel<128><<<1, 128, 0, stream>>>(
|
||||
max_len_kv.data<int>(),
|
||||
seq_lens_this_time.data<int>(),
|
||||
seq_lens_decoder.data<int>(),
|
||||
bsz
|
||||
);
|
||||
max_len_kv.data<int>(), seq_lens_this_time.data<int>(),
|
||||
seq_lens_decoder.data<int>(), bsz);
|
||||
|
||||
max_len_kv_cpu =
|
||||
max_len_kv.copy_to(paddle::CPUPlace(), false);
|
||||
max_len_kv_cpu = max_len_kv.copy_to(paddle::CPUPlace(), false);
|
||||
|
||||
if (max_enc_len_this_time > 0) {
|
||||
const uint32_t max_tile_size_per_bs_kv = div_up(max_enc_dec_len_this_time, block_size);
|
||||
kv_batch_ids = GetEmptyTensor({bsz * max_tile_size_per_bs_kv},
|
||||
paddle::DataType::INT32,
|
||||
seq_lens_encoder.place());
|
||||
kv_tile_ids_per_batch = GetEmptyTensor({bsz * max_tile_size_per_bs_kv},
|
||||
paddle::DataType::INT32,
|
||||
seq_lens_encoder.place());
|
||||
const uint32_t max_tile_size_per_bs_kv =
|
||||
div_up(max_enc_dec_len_this_time, block_size);
|
||||
kv_batch_ids =
|
||||
GetEmptyTensor({bsz * max_tile_size_per_bs_kv}, paddle::DataType::INT32,
|
||||
seq_lens_encoder.place());
|
||||
kv_tile_ids_per_batch =
|
||||
GetEmptyTensor({bsz * max_tile_size_per_bs_kv}, paddle::DataType::INT32,
|
||||
seq_lens_encoder.place());
|
||||
auto kv_num_blocks_x =
|
||||
GetEmptyTensor({1}, paddle::DataType::INT32, seq_lens_encoder.place());
|
||||
|
||||
split_kv_block<<<1, 32, 0, seq_lens_encoder.stream()>>>(
|
||||
seq_lens_decoder.data<int>(),
|
||||
// sequence_lengths->data<int>(),
|
||||
seq_lens_encoder.data<int>(),
|
||||
kv_batch_ids.data<int>(),
|
||||
kv_tile_ids_per_batch.data<int>(),
|
||||
kv_num_blocks_x.data<int>(),
|
||||
bsz,
|
||||
block_size,
|
||||
block_size
|
||||
);
|
||||
seq_lens_decoder.data<int>(),
|
||||
// sequence_lengths->data<int>(),
|
||||
seq_lens_encoder.data<int>(), kv_batch_ids.data<int>(),
|
||||
kv_tile_ids_per_batch.data<int>(), kv_num_blocks_x.data<int>(), bsz,
|
||||
block_size, block_size);
|
||||
|
||||
kv_num_blocks_x_cpu = kv_num_blocks_x.copy_to(paddle::CPUPlace(), false);
|
||||
|
||||
const uint32_t encoder_max_tile_size_per_bs_q = div_up(
|
||||
(max_enc_dec_len_this_time * group_size), encoder_block_shape_q);
|
||||
const uint32_t encoder_max_tile_size_per_bs_q =
|
||||
div_up((max_enc_dec_len_this_time * group_size), encoder_block_shape_q);
|
||||
encoder_batch_ids =
|
||||
GetEmptyTensor({bsz * encoder_max_tile_size_per_bs_q},
|
||||
paddle::DataType::INT32,
|
||||
seq_lens_encoder.place());
|
||||
paddle::DataType::INT32, seq_lens_encoder.place());
|
||||
encoder_tile_ids_per_batch =
|
||||
GetEmptyTensor({bsz * encoder_max_tile_size_per_bs_q},
|
||||
paddle::DataType::INT32,
|
||||
seq_lens_encoder.place());
|
||||
paddle::DataType::INT32, seq_lens_encoder.place());
|
||||
auto encoder_num_blocks_x =
|
||||
GetEmptyTensor({1}, paddle::DataType::INT32, seq_lens_encoder.place());
|
||||
split_q_block<<<1, 32, 0, stream>>>(seq_lens_encoder.data<int>(),
|
||||
nullptr,
|
||||
split_q_block<<<1, 32, 0, stream>>>(seq_lens_encoder.data<int>(), nullptr,
|
||||
encoder_batch_ids.data<int>(),
|
||||
encoder_tile_ids_per_batch.data<int>(),
|
||||
encoder_num_blocks_x.data<int>(),
|
||||
bsz,
|
||||
encoder_block_shape_q,
|
||||
group_size);
|
||||
encoder_num_blocks_x.data<int>(), bsz,
|
||||
encoder_block_shape_q, group_size);
|
||||
encoder_num_blocks_x_cpu =
|
||||
encoder_num_blocks_x.copy_to(paddle::CPUPlace(), false);
|
||||
} else {
|
||||
encoder_batch_ids =
|
||||
GetEmptyTensor({0}, paddle::DataType::INT32, seq_lens_encoder.place());
|
||||
encoder_tile_ids_per_batch =
|
||||
GetEmptyTensor({0}, paddle::DataType::INT32, seq_lens_encoder.place());
|
||||
encoder_num_blocks_x_cpu =
|
||||
GetEmptyTensor({0}, paddle::DataType::INT32, paddle::CPUPlace());
|
||||
kv_batch_ids =
|
||||
GetEmptyTensor({0}, paddle::DataType::INT32, seq_lens_encoder.place());
|
||||
kv_tile_ids_per_batch =
|
||||
GetEmptyTensor({0}, paddle::DataType::INT32, seq_lens_encoder.place());
|
||||
kv_num_blocks_x_cpu =
|
||||
GetEmptyTensor({0}, paddle::DataType::INT32, seq_lens_encoder.place());
|
||||
}
|
||||
if (max_just_dec_len_this_time > 0) {
|
||||
const uint32_t decoder_max_tile_size_per_bs_q =
|
||||
@@ -297,24 +297,26 @@ std::vector<paddle::Tensor> GetBlockShapeAndSplitKVBlock(
|
||||
|
||||
decoder_batch_ids =
|
||||
GetEmptyTensor({bsz * decoder_max_tile_size_per_bs_q},
|
||||
paddle::DataType::INT32,
|
||||
seq_lens_encoder.place());
|
||||
paddle::DataType::INT32, seq_lens_encoder.place());
|
||||
decoder_tile_ids_per_batch =
|
||||
GetEmptyTensor({bsz * decoder_max_tile_size_per_bs_q},
|
||||
paddle::DataType::INT32,
|
||||
seq_lens_encoder.place());
|
||||
paddle::DataType::INT32, seq_lens_encoder.place());
|
||||
auto decoder_num_blocks_x =
|
||||
GetEmptyTensor({1}, paddle::DataType::INT32, seq_lens_encoder.place());
|
||||
split_q_block<<<1, 32, 0, stream>>>(seq_lens_this_time.data<int>(),
|
||||
seq_lens_encoder.data<int>(),
|
||||
decoder_batch_ids.data<int>(),
|
||||
decoder_tile_ids_per_batch.data<int>(),
|
||||
decoder_num_blocks_x.data<int>(),
|
||||
bsz,
|
||||
decoder_block_shape_q,
|
||||
group_size);
|
||||
split_q_block<<<1, 32, 0, stream>>>(
|
||||
seq_lens_this_time.data<int>(), seq_lens_encoder.data<int>(),
|
||||
decoder_batch_ids.data<int>(), decoder_tile_ids_per_batch.data<int>(),
|
||||
decoder_num_blocks_x.data<int>(), bsz, decoder_block_shape_q,
|
||||
group_size);
|
||||
decoder_num_blocks_x_cpu =
|
||||
decoder_num_blocks_x.copy_to(paddle::CPUPlace(), false);
|
||||
} else {
|
||||
decoder_batch_ids =
|
||||
GetEmptyTensor({0}, paddle::DataType::INT32, seq_lens_encoder.place());
|
||||
decoder_tile_ids_per_batch =
|
||||
GetEmptyTensor({0}, paddle::DataType::INT32, seq_lens_encoder.place());
|
||||
decoder_num_blocks_x_cpu =
|
||||
GetEmptyTensor({0}, paddle::DataType::INT32, paddle::CPUPlace());
|
||||
}
|
||||
|
||||
return {encoder_batch_ids,
|
||||
@@ -331,28 +333,22 @@ std::vector<paddle::Tensor> GetBlockShapeAndSplitKVBlock(
|
||||
}
|
||||
|
||||
std::vector<paddle::DataType> GetBlockShapeAndSplitKVBlockInferDtype(
|
||||
const paddle::DataType& seq_lens_encoder_dtype,
|
||||
const paddle::DataType& seq_lens_decoder_dtype,
|
||||
const paddle::DataType& seq_lens_this_time_dtype,
|
||||
const paddle::DataType& cum_offsets_dtype) {
|
||||
return {paddle::DataType::INT32,
|
||||
paddle::DataType::INT32,
|
||||
paddle::DataType::INT32,
|
||||
paddle::DataType::INT32,
|
||||
paddle::DataType::INT32,
|
||||
paddle::DataType::INT32,
|
||||
paddle::DataType::INT32,
|
||||
paddle::DataType::INT32,
|
||||
paddle::DataType::INT32,
|
||||
paddle::DataType::INT32,
|
||||
paddle::DataType::INT32};
|
||||
const paddle::DataType &seq_lens_encoder_dtype,
|
||||
const paddle::DataType &seq_lens_decoder_dtype,
|
||||
const paddle::DataType &seq_lens_this_time_dtype,
|
||||
const paddle::DataType &cum_offsets_dtype) {
|
||||
return {
|
||||
paddle::DataType::INT32, paddle::DataType::INT32, paddle::DataType::INT32,
|
||||
paddle::DataType::INT32, paddle::DataType::INT32, paddle::DataType::INT32,
|
||||
paddle::DataType::INT32, paddle::DataType::INT32, paddle::DataType::INT32,
|
||||
paddle::DataType::INT32, paddle::DataType::INT32};
|
||||
}
|
||||
|
||||
std::vector<std::vector<int64_t>> GetBlockShapeAndSplitKVBlockInferShape(
|
||||
const std::vector<int64_t>& seq_lens_encoder_shape,
|
||||
const std::vector<int64_t>& seq_lens_decoder_shape,
|
||||
const std::vector<int64_t>& seq_lens_this_time_shape,
|
||||
const std::vector<int64_t>& cum_offsets_shape) {
|
||||
const std::vector<int64_t> &seq_lens_encoder_shape,
|
||||
const std::vector<int64_t> &seq_lens_decoder_shape,
|
||||
const std::vector<int64_t> &seq_lens_this_time_shape,
|
||||
const std::vector<int64_t> &cum_offsets_shape) {
|
||||
std::vector<int64_t> dynamic_shape = {-1};
|
||||
|
||||
return {dynamic_shape,
|
||||
@@ -369,9 +365,7 @@ std::vector<std::vector<int64_t>> GetBlockShapeAndSplitKVBlockInferShape(
|
||||
}
|
||||
|
||||
PD_BUILD_STATIC_OP(get_block_shape_and_split_kv_block)
|
||||
.Inputs({"seq_lens_encoder",
|
||||
"seq_lens_decoder",
|
||||
"seq_lens_this_time",
|
||||
.Inputs({"seq_lens_encoder", "seq_lens_decoder", "seq_lens_this_time",
|
||||
"cum_offsets"})
|
||||
.Outputs({paddle::Optional("encoder_batch_ids"),
|
||||
paddle::Optional("encoder_tile_ids_per_batch"),
|
||||
@@ -382,12 +376,9 @@ PD_BUILD_STATIC_OP(get_block_shape_and_split_kv_block)
|
||||
paddle::Optional("decoder_batch_ids"),
|
||||
paddle::Optional("decoder_tile_ids_per_batch"),
|
||||
paddle::Optional("decoder_num_blocks"),
|
||||
paddle::Optional("max_len_kv"),
|
||||
"set_max_lengths"})
|
||||
.Attrs({"encoder_block_shape_q: int",
|
||||
"decoder_block_shape_q: int",
|
||||
"group_size: int",
|
||||
"block_size: int",
|
||||
paddle::Optional("max_len_kv"), "set_max_lengths"})
|
||||
.Attrs({"encoder_block_shape_q: int", "decoder_block_shape_q: int",
|
||||
"group_size: int", "block_size: int",
|
||||
"decoder_step_token_num: int"})
|
||||
.SetKernelFn(PD_KERNEL(GetBlockShapeAndSplitKVBlock))
|
||||
.SetInferShapeFn(PD_INFER_SHAPE(GetBlockShapeAndSplitKVBlockInferShape))
|
||||
|
@@ -337,6 +337,8 @@ __forceinline__ __host__ __device__ void vec_cast<nv_bfloat16, float>(
|
||||
} else if (deal_each_time == 64) { \
|
||||
constexpr size_t DEAL_EACH_TIME = 64; \
|
||||
__VA_ARGS__ \
|
||||
} else { \
|
||||
PD_THROW("not support the deal_each_time", deal_each_time); \
|
||||
}
|
||||
|
||||
#define DISPATCH_NUM_THREADS(num_threads, NUM_THREADS, ...) \
|
||||
@@ -346,6 +348,8 @@ __forceinline__ __host__ __device__ void vec_cast<nv_bfloat16, float>(
|
||||
} else if (num_threads == 256) { \
|
||||
constexpr size_t NUM_THREADS = 256; \
|
||||
__VA_ARGS__ \
|
||||
} else { \
|
||||
PD_THROW("not support the num_threads", num_threads); \
|
||||
}
|
||||
|
||||
#define DISPATCH_GQA_GROUP_SIZE(group_size, GROUP_SIZE, ...) \
|
||||
@@ -376,6 +380,11 @@ __forceinline__ __host__ __device__ void vec_cast<nv_bfloat16, float>(
|
||||
} else if (group_size == 12) { \
|
||||
constexpr size_t GROUP_SIZE = 12; \
|
||||
__VA_ARGS__ \
|
||||
} else if (group_size == 16) { \
|
||||
constexpr size_t GROUP_SIZE = 16; \
|
||||
__VA_ARGS__ \
|
||||
} else { \
|
||||
PD_THROW("not support the group_size", group_size); \
|
||||
}
|
||||
|
||||
#define DISPATCH_BLOCKSHAPE_Q(block_shape_q, BLOCK_SHAPE_Q, NUM_WARP_Q, ...) \
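These DISPATCH_* macros all follow one pattern: branch on a runtime value and re-expand the body with a matching constexpr, so that kernels can be instantiated once per supported configuration. A minimal, self-contained sketch of the same idea (the macro, function, and value set here are illustrative, not taken from the repo):

#include <cstdio>

// Map a runtime block size onto a compile-time constant, then expand the body.
#define DISPATCH_DEMO_BLOCK_SIZE(block_size, BLOCK_SIZE, ...)  \
  if (block_size == 64) {                                      \
    constexpr int BLOCK_SIZE = 64;                             \
    __VA_ARGS__                                                \
  } else if (block_size == 128) {                              \
    constexpr int BLOCK_SIZE = 128;                            \
    __VA_ARGS__                                                \
  } else {                                                     \
    std::printf("unsupported block_size %d\n", block_size);    \
  }

template <int kBlockSize>
void run_kernel() {
  std::printf("instantiated with kBlockSize=%d\n", kBlockSize);
}

int main() {
  int block_size = 128;  // runtime value, e.g. read from a config
  DISPATCH_DEMO_BLOCK_SIZE(block_size, BLOCK_SIZE, { run_kernel<BLOCK_SIZE>(); });
  return 0;
}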
@@ -13,7 +13,7 @@
// limitations under the License.

#include "paddle/extension.h"

#include "pybind11/pybind11.h"
namespace py = pybind11;

// Custom exception class used to handle CUDA errors
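For context, pybind11 lets such a C++ exception type surface in Python as a named exception class. A hedged sketch of that mechanism only (the CudaRuntimeError type and module name below are hypothetical, not the repo's actual class):

#include <stdexcept>
#include "pybind11/pybind11.h"
namespace py = pybind11;

// Hypothetical C++ exception thrown when a CUDA call fails.
struct CudaRuntimeError : public std::runtime_error {
  using std::runtime_error::runtime_error;
};

PYBIND11_MODULE(example_ops, m) {
  // Register a translator: catching CudaRuntimeError in C++ raises
  // example_ops.CudaRuntimeError in Python with the same message.
  py::register_exception<CudaRuntimeError>(m, "CudaRuntimeError");
}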
@@ -125,45 +125,40 @@ paddle::Tensor FusedExpertMoeFunc(
    const bool norm_topk_prob, const bool group_moe);

std::vector<paddle::Tensor> MoeExpertDispatch(
    const paddle::Tensor& input,
    const paddle::Tensor& gating_output,
    const paddle::optional<paddle::Tensor>& gating_correction_bias,
    const paddle::optional<paddle::Tensor> &w4a8_in_scale,
    const int moe_topk,
    const bool group_moe,
    const bool topk_only_mode);
    const paddle::Tensor &input, const paddle::Tensor &gating_output,
    const paddle::optional<paddle::Tensor> &gating_correction_bias,
    const paddle::optional<paddle::Tensor> &w4a8_in_scale, const int moe_topk,
    const bool group_moe, const bool topk_only_mode);

std::vector<paddle::Tensor>
MoETopKSelectKernel(const paddle::Tensor &gating_logits,
                    const paddle::optional<paddle::Tensor> &bias,
                    const int moe_topk, const bool apply_norm_weight,
                    const bool enable_softmax_top_k_fused);
                    const paddle::optional<paddle::Tensor> &bias,
                    const int moe_topk, const bool apply_norm_weight,
                    const bool enable_softmax_top_k_fused);

std::vector<paddle::Tensor> MoERedundantTopKSelectKernel(
    const paddle::Tensor& gating_logits,
    const paddle::Tensor& expert_id_to_ep_rank_array,
    const paddle::Tensor& expert_in_rank_num_list,
    paddle::Tensor& tokens_per_expert_stats_list,
    const paddle::optional<paddle::Tensor>& bias,
    const int moe_topk,
    const bool apply_norm_weight,
    const bool enable_softmax_top_k_fused,
    const int redundant_ep_rank_num_plus_one);
std::vector<paddle::Tensor>
MoERedundantTopKSelectKernel(const paddle::Tensor &gating_logits,
                             const paddle::Tensor &expert_id_to_ep_rank_array,
                             const paddle::Tensor &expert_in_rank_num_list,
                             paddle::Tensor &tokens_per_expert_stats_list,
                             const paddle::optional<paddle::Tensor> &bias,
                             const int moe_topk, const bool apply_norm_weight,
                             const bool enable_softmax_top_k_fused,
                             const int redundant_ep_rank_num_plus_one);

std::vector<paddle::Tensor>
EPMoeExpertDispatch(const paddle::Tensor &input, const paddle::Tensor &topk_ids,
                    const paddle::Tensor &topk_weights,
                    const paddle::optional<paddle::Tensor> &ffn1_in_scale,
                    const std::vector<int> &token_nums_per_expert,
                    const int token_nums_this_rank,
                    const std::string &moe_quant_type);
                    const paddle::Tensor &topk_weights,
                    const paddle::optional<paddle::Tensor> &ffn1_in_scale,
                    const std::vector<int> &token_nums_per_expert,
                    const int token_nums_this_rank,
                    const std::string &moe_quant_type);

std::vector<paddle::Tensor> EPMoeExpertDispatchFP8(
    const paddle::Tensor &input, const paddle::Tensor &scale,
    const paddle::Tensor &topk_ids, const paddle::Tensor &topk_weights,
    const std::vector<int> &token_nums_per_expert,
    const std::vector<int> &token_nums_per_expert_padded,
    const int token_nums_this_rank, const int token_nums_this_rank_padded);
    const paddle::Tensor &token_nums_per_expert,
    const paddle::Tensor &token_nums_per_expert_padded);

std::vector<paddle::Tensor> PerTokenQuant(paddle::Tensor &input,
                                          const int block_size);
@@ -180,20 +175,35 @@ std::vector<paddle::Tensor> EPMoeExpertCombine(
    const paddle::optional<paddle::Tensor> &ffn2_bias,
    const bool norm_topk_prob, const float routed_scaling_factor);

std::vector<std::vector<int>> GetExpertTokenNum(
    const paddle::Tensor& topk_ids,
    const int num_experts);
std::vector<std::vector<int>> GetExpertTokenNum(const paddle::Tensor &topk_ids,
                                                const int num_experts);

paddle::Tensor MoeExpertFFNFunc(
    const paddle::Tensor &permute_input,
    const paddle::Tensor &tokens_expert_prefix_sum,
    const paddle::Tensor &ffn1_weight, const paddle::Tensor &ffn2_weight,
    const paddle::optional<paddle::Tensor> &ffn1_bias,
    const paddle::optional<paddle::Tensor> &ffn1_scale,
    const paddle::optional<paddle::Tensor> &ffn2_scale,
    const paddle::optional<paddle::Tensor> &ffn2_in_scale,
    const paddle::optional<paddle::Tensor> &expert_idx_per_token,
    const std::string &quant_method, const bool used_in_ep_low_latency);
    const paddle::Tensor& permute_input,
    const paddle::Tensor& tokens_expert_prefix_sum,
    const paddle::Tensor& ffn1_weight, const paddle::Tensor& ffn2_weight,
    const paddle::optional<paddle::Tensor>& ffn1_bias,
    const paddle::optional<paddle::Tensor>& ffn1_scale,
    const paddle::optional<paddle::Tensor>& ffn2_scale,
    const paddle::optional<paddle::Tensor>& ffn2_in_scale,
    const paddle::optional<paddle::Tensor>& expert_idx_per_token,
    const std::string& quant_method, const bool used_in_ep_low_latency);

paddle::Tensor MoeExpertFFNWint2Func(
    const paddle::Tensor& permute_input,
    const paddle::Tensor& tokens_expert_prefix_sum,
    const paddle::Tensor& ffn1_weight,
    const paddle::Tensor& ffn2_weight,
    const paddle::optional<paddle::Tensor>& ffn1_bias,
    const paddle::optional<paddle::Tensor>& ffn1_scale,
    const paddle::optional<paddle::Tensor>& ffn2_scale,
    const paddle::optional<paddle::Tensor>& ffn1_local_scale,
    const paddle::optional<paddle::Tensor>& ffn1_code_scale,
    const paddle::optional<paddle::Tensor>& ffn1_code_zp,
    const paddle::optional<paddle::Tensor>& ffn2_local_scale,
    const paddle::optional<paddle::Tensor>& ffn2_code_scale,
    const paddle::optional<paddle::Tensor>& ffn2_code_zp,
    const bool used_in_ep_low_latency);

paddle::Tensor MoeExpertReduceFunc(
    const paddle::Tensor &ffn_out, const paddle::Tensor &top_k_weight,
@@ -205,19 +215,16 @@ paddle::Tensor MoeExpertReduceFunc(
void InitKVSignalPerQuery(const paddle::Tensor &seq_lens_encoder_tensor,
                          const paddle::Tensor &seq_lens_this_time_tensor,
                          const paddle::Tensor &seq_lens_decoder_tensor,
                          const int rank,
                          const int num_layers);
                          const int rank, const int num_layers);

void GetOutputKVSignal(const paddle::Tensor& x,
                       int64_t rank_id,
void GetOutputKVSignal(const paddle::Tensor &x, int64_t rank_id,
                       bool wait_flag);

paddle::Tensor DequantInt8Func(const paddle::Tensor &input,
                               const paddle::Tensor &out_scale,
                               std::string dtype);

paddle::Tensor OpenShmAndGetMetaSignalFunc(const int rank,
paddle::Tensor OpenShmAndGetMetaSignalFunc(const int rank, const int device_id,
                                           const bool keep_pd_step_flag);

paddle::Tensor InitSignalLayerwiseFunc(const paddle::Tensor &kv_signal_metadata,
@@ -286,61 +293,121 @@ std::vector<paddle::Tensor> ExtractTextTokenOutput(
    const paddle::Tensor &seq_lens_this_time,
    const paddle::Tensor &cu_seqlens_q, const paddle::Tensor &score_text);

std::vector<paddle::Tensor> MoEDeepGEMMPermute(
    const paddle::Tensor& x,
    const paddle::Tensor& topk_idx,
    const int num_experts,
    const int max_tokens_per_expert
);
std::vector<paddle::Tensor> MoEDeepGEMMPermute(const paddle::Tensor &x,
                                               const paddle::Tensor &topk_idx,
                                               const int num_experts,
                                               const int max_tokens_per_expert);

std::vector<paddle::Tensor> MoEDeepGEMMDePermute(
    const paddle::Tensor& ffn_out,  // [num_experts, max_tokens_per_expert, hidden]
    const paddle::Tensor& permute_indices_per_token,  // [token_num, topk]
    const paddle::Tensor& topk_idx,
    const paddle::Tensor& topk_weights
);
    const paddle::Tensor
        &ffn_out,  // [num_experts, max_tokens_per_expert, hidden]
    const paddle::Tensor &permute_indices_per_token,  // [token_num, topk]
    const paddle::Tensor &topk_idx, const paddle::Tensor &topk_weights);

void TextImageIndexOut(const paddle::Tensor &token_type_ids,
                       const paddle::Tensor &text_input,
                       const paddle::Tensor &image_input);

void TextImageGatherScatter(paddle::Tensor &input, paddle::Tensor &text_input,
                            paddle::Tensor &image_input,
                            paddle::Tensor &token_type_ids,
                            paddle::Tensor &text_index,
                            paddle::Tensor &image_index, const bool is_scatter);

paddle::Tensor count_tokens_per_expert_func(const paddle::Tensor &topk_ids,
                                            int64_t num_experts);
std::vector<paddle::Tensor> tritonmoe_preprocess_kernel(const paddle::Tensor& topk_ids, int64_t num_experts, int64_t GEMM_BLOCK_SIZE_M);

std::vector<paddle::Tensor> MoeWna16MarlinGemmApi(
    const paddle::Tensor& a,
    const paddle::optional<paddle::Tensor>& c_or_none,
    const paddle::Tensor& b_q_weight,
    const paddle::Tensor& b_scales,
    const paddle::optional<paddle::Tensor>& global_scale_or_none,
    const paddle::optional<paddle::Tensor>& b_zeros_or_none,
    const paddle::optional<paddle::Tensor>& g_idx_or_none,
    const paddle::optional<paddle::Tensor>& perm_or_none,
    const paddle::Tensor& workspace,
    const paddle::Tensor& sorted_token_ids,
    const paddle::Tensor& expert_ids,
    const paddle::Tensor& num_tokens_post_padded,
    const paddle::Tensor& topk_weights,
    int64_t moe_block_size,
    int64_t top_k,
    bool mul_topk_weights,
    bool is_ep,
    const std::string& b_q_type_str,
    int64_t size_m,
    int64_t size_n,
    int64_t size_k,
    bool is_k_full,
    bool use_atomic_add,
    bool use_fp32_reduce,
    bool is_zp_float);
void CutlassScaledMm(paddle::Tensor &c, paddle::Tensor const &a,
                     paddle::Tensor const &b, paddle::Tensor const &a_scales,
                     paddle::Tensor const &b_scales,
                     paddle::optional<paddle::Tensor> const &bias);

void CutlassScaledMmAzp(paddle::Tensor& c, paddle::Tensor const& a,
                        paddle::Tensor const& b,
                        paddle::Tensor const& a_scales,
                        paddle::Tensor const& b_scales,
                        paddle::Tensor const& azp_adj,
                        paddle::optional<paddle::Tensor> const& azp,
                        paddle::optional<paddle::Tensor> const& bias);

void StaticScaledFp8Quant(paddle::Tensor &out, paddle::Tensor const &input,
                          paddle::Tensor const &scale);

void DynamicScaledFp8Quant(paddle::Tensor &out, paddle::Tensor const &input,
                           paddle::Tensor &scale);

void DynamicPerTokenScaledFp8Quant(paddle::Tensor &out,
                                   paddle::Tensor const &input,
                                   paddle::Tensor &scales, float scale_ub);
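These three quantization entry points write into a caller-allocated out tensor and differ mainly in where the scale comes from: fixed ahead of time, computed per tensor on the fly, or computed per token. As a rough sketch of the underlying arithmetic only (plain host-side C++, not the CUDA kernels in quantization/common.cu; the 448 bound assumes the float8 e4m3 format):

#include <algorithm>
#include <cmath>
#include <vector>

// Largest finite magnitude representable in float8 e4m3.
constexpr float kFp8E4m3Max = 448.0f;

// Dynamic per-tensor scale: the max magnitude maps to the edge of the range.
float compute_dynamic_scale(const std::vector<float>& x) {
  float max_abs = 1e-6f;  // guard against an all-zero tensor
  for (float v : x) max_abs = std::max(max_abs, std::fabs(v));
  return max_abs / kFp8E4m3Max;
}

// Quantize: divide by the scale and clamp into the fp8 range. A real kernel
// would additionally round/convert each value to the 8-bit format.
std::vector<float> quantize_with_scale(const std::vector<float>& x, float scale) {
  std::vector<float> q(x.size());
  for (size_t i = 0; i < x.size(); ++i) {
    q[i] = std::clamp(x[i] / scale, -kFp8E4m3Max, kFp8E4m3Max);
  }
  return q;
}

The per-token variant applies the same idea row by row, producing one scale per token instead of one per tensor.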
PYBIND11_MODULE(fastdeploy_ops, m) {

  m.def("get_expert_token_num", &GetExpertTokenNum,
        py::arg("topk_ids"), py::arg("num_experts"),
        "get expert token num");
  m.def("get_expert_token_num", &GetExpertTokenNum, py::arg("topk_ids"),
        py::arg("num_experts"), "get expert token num");

  /**
   * moe/fused_moe/moe_redundant_topk_select.cu
   * moe_redundant_topk_select
   */
  m.def("f_moe_redundant_topk_select", &MoERedundantTopKSelectKernel,
        py::arg("gating_logits"), py::arg("expert_id_to_ep_rank_array"),
        py::arg("expert_in_rank_num_list"),
        py::arg("tokens_per_expert_stats_list"), py::arg("bias"),
        py::arg("moe_topk"), py::arg("apply_norm_weight"),
        py::arg("enable_softmax_top_k_fused"),
        py::arg("redundant_ep_rank_num_plus_one"),
        "moe export RedundantTopKSelect function");

  /**
   * moe/fused_moe/moe_redundant_topk_select.cu
   * moe_redundant_topk_select
   */
  m.def("f_moe_redundant_topk_select", &MoERedundantTopKSelectKernel,
        py::arg("gating_logits"), py::arg("expert_id_to_ep_rank_array"),
        py::arg("expert_in_rank_num_list"), py::arg("tokens_per_expert_stats_list"),
        py::arg("bias"), py::arg("moe_topk"), py::arg("apply_norm_weight"),
        py::arg("enable_softmax_top_k_fused"), py::arg("redundant_ep_rank_num_plus_one"),
        "moe export RedundantTopKSelect function");
  /**
   * open_shm_and_get_meta_signal.cc
   * InitKVSignalPerQuery
   */
  m.def("init_kv_signal_per_query", &InitKVSignalPerQuery,
        py::arg("seq_lens_encoder_tensor"),
        py::arg("seq_lens_this_time_tensor"),
        py::arg("seq_lens_decoder_tensor"), py::arg("rank"),
        py::arg("num_layers"), "init_kv_signal_per_query function");

  /**
   * GetOutputKVSignal
   */
  m.def("get_output_kv_signal", &GetOutputKVSignal, py::arg("x"),
        py::arg("rank_id"), py::arg("wait_flag"),
        "get_output_kv_signal function");

  /**
   * open_shm_and_get_meta_signal.cc
   * InitKVSignalPerQuery
   */
  m.def("init_kv_signal_per_query", &InitKVSignalPerQuery,
        py::arg("seq_lens_encoder_tensor"), py::arg("seq_lens_this_time_tensor"),
        py::arg("seq_lens_decoder_tensor"), py::arg("rank"), py::arg("num_layers"),
        "init_kv_signal_per_query function");

  /**
   * GetOutputKVSignal
   */
  m.def("get_output_kv_signal", &GetOutputKVSignal,
        py::arg("x"), py::arg("rank_id"), py::arg("wait_flag"),
        "get_output_kv_signal function");

  m.def("moe_deepgemm_permute", &MoEDeepGEMMPermute, "MoEDeepGEMMPermute");
  m.def("moe_deepgemm_depermute", &MoEDeepGEMMDePermute, "MoEDeepGEMMDePermute");
  m.def("moe_deepgemm_permute", &MoEDeepGEMMPermute, "MoEDeepGEMMPermute");
  m.def("moe_deepgemm_depermute", &MoEDeepGEMMDePermute,
        "MoEDeepGEMMDePermute");
  /**
   * alloc_cache_pinned.cc
   * cuda_host_alloc
@@ -398,12 +465,7 @@ PYBIND11_MODULE(fastdeploy_ops, m) {
        py::arg("token_nums_per_expert"), py::arg("token_nums_this_rank"),
        py::arg("moe_quant_type"), "ep moe export dispatch function");

  m.def("ep_moe_expert_dispatch_fp8", &EPMoeExpertDispatchFP8, py::arg("input"),
        py::arg("scale"), py::arg("topk_ids"), py::arg("topk_weights"),
        py::arg("token_nums_per_expert"),
        py::arg("token_nums_per_expert_padded"),
        py::arg("token_nums_this_rank"), py::arg("token_nums_this_rank_padded"),
        "ep moe export dispatch function");
  m.def("ep_moe_expert_dispatch_fp8", &EPMoeExpertDispatchFP8);

  m.def("ep_moe_expert_combine", &EPMoeExpertCombine, py::arg("ffn_out"),
        py::arg("expert_scales_float"), py::arg("permute_indices_per_token"),
@@ -437,6 +499,12 @@ PYBIND11_MODULE(fastdeploy_ops, m) {
   */
  m.def("moe_expert_ffn", &MoeExpertFFNFunc, "moe export ffn function");

  /**
   * moe/fused_moe/moe_ffn_wint2.cu
   * moe_expert_ffn_wint2
   */
  m.def("moe_expert_ffn_wint2", &MoeExpertFFNWint2Func, "moe export ffn wint2 function");

  /**
   * moe/fused_moe/moe_expert_reduce.cu
   * moe_expert_reduce
@@ -523,4 +591,66 @@ PYBIND11_MODULE(fastdeploy_ops, m) {

  m.def("group_swiglu_with_masked", &GroupSwigluWithMasked,
        "group_swiglu_with_masked function");

  m.def("text_image_index_out", &TextImageIndexOut,
        "text_image_index_out function");

  m.def("text_image_gather_scatter", &TextImageGatherScatter,
        "text_image_gather_scatter function");

  m.def("count_tokens_per_expert_func", &count_tokens_per_expert_func);
  m.def("tritonmoe_preprocess_func", &tritonmoe_preprocess_kernel);

  m.def("MoeWna16MarlinGemmApi", &MoeWna16MarlinGemmApi,
        py::arg("a"),
        py::arg("c_or_none"),
        py::arg("b_q_weight"),
        py::arg("b_scales"),
        py::arg("global_scale_or_none"),
        py::arg("b_zeros_or_none"),
        py::arg("g_idx_or_none"),
        py::arg("perm_or_none"),
        py::arg("workspace"),
        py::arg("sorted_token_ids"),
        py::arg("expert_ids"),
        py::arg("num_tokens_post_padded"),
        py::arg("topk_weights"),
        py::arg("moe_block_size"),
        py::arg("top_k"),
        py::arg("mul_topk_weights"),
        py::arg("is_ep"),
        py::arg("b_q_type_str"),
        py::arg("size_m"),
        py::arg("size_n"),
        py::arg("size_k"),
        py::arg("is_k_full"),
        py::arg("use_atomic_add"),
        py::arg("use_fp32_reduce"),
        py::arg("is_zp_float"));

  /**
   * cutlass_scaled_mm.cu
   * cutlass_scaled_mm
   * cutlass_scaled_mm_azp
   */
  m.def("cutlass_scaled_mm", &CutlassScaledMm, "cutlass_scaled_mm function");
  m.def("cutlass_scaled_mm_azp", &CutlassScaledMmAzp, "cutlass_scaled_mm_azp function");

  /**
   * quantization/common.cu
   * static_scaled_fp8_quant
   * dynamic_scaled_fp8_quant
   * dynamic_per_token_scaled_fp8_quant
   */
  m.def("static_scaled_fp8_quant", &StaticScaledFp8Quant, "static_scaled_fp8_quant function",
        py::arg("out"), py::arg("input"), py::arg("scale"));

  m.def("dynamic_scaled_fp8_quant", &DynamicScaledFp8Quant,
        "dynamic_scaled_fp8_quant function",
        py::arg("out"), py::arg("input"), py::arg("scale"));

  m.def("dynamic_per_token_scaled_fp8_quant", &DynamicPerTokenScaledFp8Quant,
        "dynamic_per_token_scaled_fp8_quant function",
        py::arg("out"), py::arg("input"), py::arg("scales"), py::arg("scale_ub"));
}
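One readability note on the bindings above: where py::arg(...) is supplied, the op can be called from Python with keyword arguments, while a binding registered without it (for example the reworked ep_moe_expert_dispatch_fp8) accepts positional arguments only. A minimal sketch of the two styles (the scale_add function and module name are hypothetical):

#include "pybind11/pybind11.h"
namespace py = pybind11;

int scale_add(int x, int y, int factor) { return factor * (x + y); }

PYBIND11_MODULE(binding_styles, m) {
  // Keyword-capable: callable as scale_add_kw(x=1, y=2, factor=3).
  m.def("scale_add_kw", &scale_add, py::arg("x"), py::arg("y"),
        py::arg("factor"), "scale_add with named arguments");

  // Positional-only registration: callable as scale_add_pos(1, 2, 3).
  m.def("scale_add_pos", &scale_add);
}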
250
custom_ops/gpu_ops/cutlass_extensions/arch/memory_copy_sm80.h
Normal file
@@ -0,0 +1,250 @@
/***************************************************************************************************
 * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 * list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 * contributors may be used to endorse or promote products derived from
 * this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 **************************************************************************************************/

/*! \file
    \brief Architecture-specific operators on memory added for SM80
*/

#pragma once

#include "cutlass/cutlass.h"
#include "cutlass/complex.h"
#include "cutlass/arch/memory.h"
#include "cutlass/arch/memory_sm75.h"
#include "cutlass/arch/memory_sm80.h"
#include "cutlass/arch/cache_operation.h"

namespace cutlass {
namespace arch {

////////////////////////////////////////////////////////////////////////////////////////////////////

/// Initiates an asynchronous copy from global memory to shared memory.
///
/// cp.async
///
template <
    /// Size of the access in bytes
    int SizeInBytes,
    /// Cache operation
    CacheOperation::Kind cache_op = CacheOperation::Always,
    bool GlobalToShared = true>
struct copy;

/// Initiates an asynchronous copy from global memory to shared memory. Rather than predicate
/// the entire transfer, zeros are written to SMEM if the guard predicate is false.
///
/// cp.async
///
template <
    /// Size of the access in bytes
    int SizeInBytes,
    /// Cache operation
    CacheOperation::Kind cache_op = CacheOperation::Always,
    bool GlobalToShared = true>
struct copy_zfill;

/// Blocks until all but <N> previous cp.async.commit_group operations have committed.
///
/// cp.async
///
template <int N, bool GlobalToShared = true>
struct copy_wait;
////////////////////////////////////////////////////////////////////////////////////////////////////

/// Partial specialization
template <
    /// Size of the access in bytes
    int SizeInBytes>
struct copy<SizeInBytes, CacheOperation::Always, true> {

  /// Copy
  CUTLASS_DEVICE
  copy(void *smem_ptr, void const *global_ptr, bool pred_guard = true) {
    cp_async<SizeInBytes, CacheOperation::Always>(smem_ptr, global_ptr, pred_guard);
  }
};

/// Partial specialization
template <
    /// Size of the access in bytes
    int SizeInBytes>
struct copy<SizeInBytes, CacheOperation::Always, false> {

  /// Copy
  CUTLASS_DEVICE
  copy(void *smem_ptr, void const *global_ptr, bool pred_guard = true) {
    using AccessType = Array<uint8_t, SizeInBytes>;

    if (pred_guard) {
      *static_cast<AccessType *>(smem_ptr) = *static_cast<AccessType const *>(global_ptr);
    }
  }
};

/// Partial specialization
template <
    /// Size of the access in bytes
    int SizeInBytes>
struct copy_zfill<SizeInBytes, CacheOperation::Always, true> {

  /// Copy with zero fill
  CUTLASS_DEVICE
  copy_zfill(void *smem_ptr, void const *global_ptr, bool pred_guard) {
    cp_async_zfill<SizeInBytes, CacheOperation::Always>(smem_ptr, global_ptr, pred_guard);
  }
};

/// Partial specialization
template <
    /// Size of the access in bytes
    int SizeInBytes>
struct copy_zfill<SizeInBytes, CacheOperation::Always, false> {

  /// Copy with zero fill
  CUTLASS_DEVICE
  copy_zfill(void *smem_ptr, void const *global_ptr, bool pred_guard) {
    using AccessType = Array<uint8_t, SizeInBytes>;

    if (pred_guard) {
      *static_cast<AccessType *>(smem_ptr) = *static_cast<AccessType const *>(global_ptr);
    }
    else {
      AccessType zeros;
      zeros.clear();
      *static_cast<AccessType *>(smem_ptr) = zeros;
    }
  }
};

////////////////////////////////////////////////////////////////////////////////////////////////////

/// Partial specialization
template <
    /// Size of the access in bytes
    int SizeInBytes>
struct copy<SizeInBytes, CacheOperation::Global, true> {

  /// Copy
  CUTLASS_DEVICE
  copy(void *smem_ptr, void const *global_ptr, bool pred_guard = true) {
    cp_async<SizeInBytes, CacheOperation::Global>(smem_ptr, global_ptr, pred_guard);
  }
};
/// Partial specialization
template <
    /// Size of the access in bytes
    int SizeInBytes>
struct copy<SizeInBytes, CacheOperation::Global, false> {

  /// Copy
  CUTLASS_DEVICE
  copy(void *smem_ptr, void const *global_ptr, bool pred_guard = true) {
    using AccessType = Array<uint8_t, SizeInBytes>;

    if (pred_guard) {
      *static_cast<AccessType *>(smem_ptr) = *static_cast<AccessType const *>(global_ptr);
    }
  }
};

/// Partial specialization
template <
    /// Size of the access in bytes
    int SizeInBytes>
struct copy_zfill<SizeInBytes, CacheOperation::Global, true> {

  /// Copy with zero fill
  CUTLASS_DEVICE
  copy_zfill(void *smem_ptr, void const *global_ptr, bool pred_guard = true) {
    cp_async_zfill<SizeInBytes, CacheOperation::Global>(smem_ptr, global_ptr, pred_guard);
  }
};

/// Partial specialization
template <
    /// Size of the access in bytes
    int SizeInBytes>
struct copy_zfill<SizeInBytes, CacheOperation::Global, false> {

  /// Copy with zero fill
  CUTLASS_DEVICE
  copy_zfill(void *smem_ptr, void const *global_ptr, bool pred_guard = true) {
    using AccessType = Array<uint8_t, SizeInBytes>;

    if (pred_guard) {
      *static_cast<AccessType *>(smem_ptr) = *static_cast<AccessType const *>(global_ptr);
    }
    else {
      AccessType zeros;
      zeros.clear();
      *static_cast<AccessType *>(smem_ptr) = zeros;
    }
  }
};

/// Establishes an ordering w.r.t previously issued cp.async instructions. Does not block.
template <bool GlobalToShared>
CUTLASS_DEVICE
void copy_fence() {}

template <>
CUTLASS_DEVICE
void copy_fence<true>() {
  cp_async_fence();
}

////////////////////////////////////////////////////////////////////////////////////////////////////

/// Partial specialization
template <int N>
struct copy_wait<N, false> {

  CUTLASS_DEVICE
  copy_wait() {}
};

/// Partial specialization
template <int N>
struct copy_wait<N, true> {

  CUTLASS_DEVICE
  copy_wait() { cp_async_wait<N>(); }
};

/////////////////////////////////////////////////////////////////////////////////////////////////

}  // namespace arch
}  // namespace cutlass

/////////////////////////////////////////////////////////////////////////////////////////////////
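A hedged usage sketch of these wrappers inside a kernel: issue the (possibly predicated) global-to-shared copies, commit them with copy_fence, then block with copy_wait before reading shared memory. The kernel below is illustrative only, assumes SM80+ and a 64-thread launch, and the include path and names are made up rather than taken from the repo.

// Assumed include path for the header introduced above.
#include "cutlass_extensions/arch/memory_copy_sm80.h"

__global__ void demo_tile_load(const float *gmem, float *out, int n) {
  __shared__ __align__(16) float smem[256];
  const int tid = threadIdx.x;  // launched with 64 threads in this sketch

  // Each thread issues one 16-byte (float4-sized) cp.async copy; when the
  // predicate is false, copy_zfill writes zeros into shared memory instead.
  const bool in_bounds = (tid * 4 + 3) < n;
  cutlass::arch::copy_zfill<16, cutlass::arch::CacheOperation::Always, true>(
      smem + tid * 4, gmem + tid * 4, in_bounds);

  // Commit the outstanding cp.async group, then block until it has landed.
  cutlass::arch::copy_fence<true>();
  cutlass::arch::copy_wait<0, true>();
  __syncthreads();

  if (tid < 64) {
    out[tid] = smem[tid] * 2.0f;  // consume the staged tile
  }
}

The GlobalToShared=false specializations fall back to plain predicated loads/stores, which is what makes the same call sites usable on targets without cp.async support.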