Sync v2.0 version of code to GitHub repo

This commit is contained in:
Jiang-Jia-Jun
2025-06-29 23:29:37 +00:00
parent d151496038
commit 92c2cfa2e7
597 changed files with 78776 additions and 22905 deletions

29
.clang-format Normal file
View File

@@ -0,0 +1,29 @@
# This file is used by clang-format to autoformat paddle source code
#
# clang-format is part of the llvm toolchain.
# You need to install llvm and clang to format the source code.
#
# The basic usage is,
# clang-format -i -style=file PATH/TO/SOURCE/CODE
#
# The -style=file option implicitly uses the ".clang-format" file located in one of the
# parent directories.
# The -i flag means in-place change.
#
# The documentation for clang-format is at:
# http://clang.llvm.org/docs/ClangFormat.html
# http://clang.llvm.org/docs/ClangFormatStyleOptions.html
---
Language: Cpp
BasedOnStyle: Google
IndentWidth: 4
TabWidth: 2
ContinuationIndentWidth: 4
AccessModifierOffset: -1 # private/protected/public have no indent in class
Standard: Cpp11
AllowAllParametersOfDeclarationOnNextLine: true
BinPackParameters: false
BinPackArguments: false
IncludeBlocks: Preserve
IncludeIsMainSourceRegex: (\.cu)$
...

6
.gitignore vendored
View File

@@ -121,7 +121,7 @@ dmypy.json
FETCH_HEAD
#log
log/
log*/
checkpoints/
checkpoints_origin/
@@ -158,3 +158,7 @@ custom_ops/gpu_ops/fp8_deep_gemm/deep_gemm/include/cute
# buff
custom_ops/tmp*
build
.ccls-cache

View File

@@ -16,7 +16,7 @@ repos:
rev: v0.11.7
hooks:
- id: ruff
args: [--output-format, github, --fix]
args: [--output-format, github, --fix, --line-length=120]
# # Spell check
# - repo: https://github.com/codespell-project/codespell
# rev: v2.4.1
@@ -29,14 +29,15 @@ repos:
rev: 6.0.1
hooks:
- id: isort
# Formatting
- repo: https://github.com/pre-commit/mirrors-clang-format
rev: v20.1.3
hooks:
- id: clang-format
# exclude: '.*'
types_or: [c++, cuda]
args: [--style=file, --verbose]
# # Formatting
# - repo: https://github.com/pre-commit/mirrors-clang-format
# rev: v20.1.3
# hooks:
# - id: clang-format
# # exclude: '.*'
# types_or: [c++, cuda]
# args: [--style=file, --verbose]
# markdown
- repo: https://github.com/jackdewinter/pymarkdown
rev: v0.9.29

156
README.md
View File

@@ -1,9 +1,8 @@
# FastDeploy 2.0: Large Language Model Inference and Deployment
<p align="center">
<a href="./LICENSE"><img src="https://img.shields.io/badge/license-Apache%202-dfd.svg"></a>
<a href="https://github.com/PaddlePaddle/FastDeploy/releases"><img src="https://img.shields.io/github/v/release/PaddlePaddle/FastDeploy?color=ffa"></a>
<a href=""><img src="https://img.shields.io/badge/python-3.10+-aff.svg"></a>
<a href="https://github.com/PaddlePaddle/FastDeploy/releases"><img src="https://github.com/user-attachments/assets/42b0039f-39e3-4279-afda-6d1865dfbffb" width="500"></a>
</p>
<p align="center">
<a href=""><img src="https://img.shields.io/badge/python-3.10-aff.svg"></a>
<a href=""><img src="https://img.shields.io/badge/os-linux-pink.svg"></a>
<a href="https://github.com/PaddlePaddle/FastDeploy/graphs/contributors"><img src="https://img.shields.io/github/contributors/PaddlePaddle/FastDeploy?color=9ea"></a>
<a href="https://github.com/PaddlePaddle/FastDeploy/commits"><img src="https://img.shields.io/github/commit-activity/m/PaddlePaddle/FastDeploy?color=3af"></a>
@@ -11,105 +10,78 @@
<a href="https://github.com/PaddlePaddle/FastDeploy/stargazers"><img src="https://img.shields.io/github/stars/PaddlePaddle/FastDeploy?color=ccf"></a>
</p>
FastDeploy has been upgraded to version 2.0 and supports inference for multiple large models (currently only Qwen2 is supported; more models will be added soon). Its inference and deployment features include:
<p align="center">
<a href="docs/get_started/installation/README.md"><b> Installation </b></a>
|
<a href="docs/get_started.md"><b> Quick Start </b></a>
|
<a href="docs/supported_models.md"><b> Supported Models </b></a>
</p>
- Deploy a model as a streaming service with a single command
- Accelerate model inference with tensor parallelism
- Support for PagedAttention and continuous batching (dynamic batching)
- Compatible with the OpenAI HTTP protocol
- Weight-only int8/int4 lossless compression
- Prometheus metrics support
--------------------------------------------------------------------------------
# FastDeploy 2.0: Inference and Deployment Toolkit for LLMs and VLMs based on PaddlePaddle
> Note: If you are still using FastDeploy to deploy small models (e.g. CV suite models such as PaddleClas/PaddleOCR), please check out the [release/1.1.0 branch](https://github.com/PaddlePaddle/FastDeploy/tree/release/1.1.0).
## News
## Environment Requirements
- A800/H800/H100
- Python>=3.10
- CUDA>=12.3
- CUDNN>=9.5
- Linux X64
**[2025-06] 🔥 Released FastDeploy v2.0:** Supports inference and deployment for ERNIE 4.5. Furthermore, we open-source an industrial-grade PD disaggregation solution with context caching and dynamic role switching for effective resource utilization, further enhancing inference performance for MoE models.
## Installation
## About
### Docker Installation (Recommended)
```
docker pull ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/fastdeploy:2.0.0.0-alpha
```
**FastDeploy** is an inference and deployment toolkit for large language models and visual language models based on PaddlePaddle. It delivers **production-ready, out-of-the-box deployment solutions** with core acceleration technologies:
### Build from Source
#### Install PaddlePaddle
> Note: Install the nightly build version (the code must be newer than 2025.05.30); see [PaddlePaddle Installation](https://www.paddlepaddle.org.cn/en/install/quick?docurl=/documentation/docs/en/develop/install/pip/linux-pip_en.html) and choose the CUDA 12.6 develop (Nightly build) version.
```
python -m pip install --pre paddlepaddle-gpu -i https://www.paddlepaddle.org.cn/packages/nightly/cu126/
```
- 🚀 **Load-Balanced PD Disaggregation**: Industrial-grade solution featuring context caching and dynamic instance role switching. Optimizes resource utilization while balancing SLO compliance and throughput.
- 🔄 **Unified KV Cache Transmission**: Lightweight high-performance transport library with intelligent NVLink/RDMA selection.
- 🤝 **OpenAI API Server and vLLM Compatible**: One-command deployment with [vLLM](https://github.com/vllm-project/vllm/) interface compatibility.
- 🧮 **Comprehensive Quantization Format Support**: W8A16, W8A8, W4A16, W4A8, W2A16, FP8, and more.
- **Advanced Acceleration Techniques**: Speculative decoding, Multi-Token Prediction (MTP) and Chunked Prefill.
- 🖥️ **Multi-Hardware Support**: NVIDIA GPU, Kunlunxin XPU, Hygon DCU, Ascend NPU, Iluvatar GPU, Enflame GCU, MetaX GPU etc.
#### Build and Install FastDeploy
## Requirements
```
# Build
cd FastDeploy
bash build.sh
# Install
pip install dist/fastdeploy-2.0.0a0-py3-none-any.whl
```
- OS: Linux
- Python: 3.10 ~ 3.12
## Quick Start
## Installation
After installation, run the following commands to quickly deploy the Qwen2 model. For more parameters and their meanings, see the [parameter documentation](docs/serving.md).
FastDeploy supports inference deployment on **NVIDIA GPUs**, **Kunlunxin XPUs**, **Iluvatar GPUs**, **Enflame GCUs**, and other hardware. For detailed installation instructions:
``` shell
# Download and extract the Qwen model
wget https://fastdeploy.bj.bcebos.com/llm/models/Qwen2-7B-Instruct.tar.gz && tar xvf Qwen2-7B-Instruct.tar.gz
# Deploy on a single GPU
python -m fastdeploy.entrypoints.openai.api_server --model ./Qwen2-7B-Instruct --port 8188 --tensor-parallel-size 1
```
- [NVIDIA GPU](./docs/installation/nvidia_cuda.md)
- [Kunlunxin XPU](./docs/en/get_started/installation/kunlunxin_xpu.md)
- [Iluvatar GPU](./docs/en/get_started/installation/iluvatar_gpu.md)
- [Enflame GCU](./docs/en/get_started/installation/Enflame_gcu.md)
Send requests to the model service with the following command:
``` shell
curl -X POST "http://0.0.0.0:8188/v1/chat/completions" \
-H "Content-Type: application/json" \
-d '{
"messages": [
{"role": "user", "content": "你好,你的名字是什么?"}
]
}'
```
The response looks like this:
``` json
{
"id": "chatcmpl-db662f47-7c8c-4945-9a7a-db563b2ddd8d",
"object": "chat.completion",
"created": 1749451045,
"model": "default",
"choices": [
{
"index": 0,
"message": {
"role": "assistant",
"content": "你好!我叫通义千问。",
"reasoning_content": null
},
"finish_reason": "stop"
}
],
"usage": {
"prompt_tokens": 25,
"total_tokens": 35,
"completion_tokens": 10,
"prompt_tokens_details": null
}
}
```
FastDeploy provides a service API fully compatible with OpenAI (the `model` and `api_key` fields are currently unsupported and will be ignored if set). You can also send requests to the service with the openai Python client, as sketched below.
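A minimal sketch using the openai Python client (installed separately, e.g. `pip install openai`), assuming the quick-start service above is listening on port 8188; the `api_key` and `model` values are placeholders, since the server ignores them:

``` python
# Minimal sketch: query the FastDeploy OpenAI-compatible server with the openai client.
# Assumes the quick-start deployment above (port 8188); api_key/model are placeholders
# because the server currently ignores both fields.
from openai import OpenAI

client = OpenAI(base_url="http://0.0.0.0:8188/v1", api_key="EMPTY")

response = client.chat.completions.create(
    model="default",
    messages=[{"role": "user", "content": "你好,你的名字是什么?"}],
)
print(response.choices[0].message.content)
```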
**Note:** We are actively working on expanding hardware support. Additional hardware platforms including Ascend NPU, Hygon DCU, and MetaX GPU are currently under development and testing. Stay tuned for updates!
## Deployment Documentation
- [Offline Inference](docs/offline_inference.md)
- [Service Deployment](docs/serving.md)
- [Service Metrics](docs/metrics.md)
## Get Started
# Code Overview
- [Code Directory Guide](docs/code_guide.md)
- Suggestions and issues encountered while using FastDeploy are welcome via GitHub issues.
Learn how to use FastDeploy through our documentation:
- [10-Minutes Quick Deployment](./docs/get_started/quick_start.md)
- [ERNIE-4.5 Large Language Model Deployment](./docs/get_started/ernie-4.5.md)
- [ERNIE-4.5-VL Multimodal Model Deployment](./docs/get_started/ernie-4.5-vl.md)
- [Offline Inference Development](./docs/offline_inference.md)
- [Online Service Deployment](./docs/serving/README.md)
- [Full Supported Models List](./docs/supported_models.md)
# License
FastDeploy is licensed under the [Apache-2.0 license](./LICENSE). During development, in order to align with the [vLLM](https://github.com/vllm-project/vllm) interface, parts of the vLLM code were referenced and used directly, for which we express our gratitude.
## Supported Models
| Model | Data Type | PD Disaggregation | Chunked Prefill | Prefix Caching | MTP | CUDA Graph | Maximum Context Length |
|:--- | :------- | :---------- | :-------- | :-------- | :----- | :----- | :----- |
|ERNIE-4.5-300B-A47B | BF16/WINT4/WINT8/W4A8C8/WINT2/FP8 | ✅(WINT4/W4A8C8/Expert Parallelism)| ✅ | ✅|✅(WINT4)| WIP |128K |
|ERNIE-4.5-300B-A47B-Base| BF16/WINT4/WINT8 | ✅(WINT4/Expert Parallelism)| ✅ | ✅|✅(WINT4)| ❌ | 128K |
|ERNIE-4.5-VL-424B-A47B | BF16/WINT4/WINT8 | WIP | ✅ | WIP | ❌ | WIP |128K |
|ERNIE-4.5-VL-28B-A3B | BF16/WINT4/WINT8 | ❌ | ✅ | WIP | ❌ | WIP |128K |
|ERNIE-4.5-21B-A3B | BF16/WINT4/WINT8/FP8 | ❌ | ✅ | ✅ | WIP | ✅|128K |
|ERNIE-4.5-21B-A3B-Base | BF16/WINT4/WINT8/FP8 | ❌ | ✅ | ✅ | WIP | ✅|128K |
|ERNIE-4.5-0.3B | BF16/WINT8/FP8 | ❌ | ✅ | ✅ | ❌ | ✅| 128K |
## Advanced Usage
- [Quantization](./docs/quantization/README.md)
- [PD Disaggregation Deployment](./docs/features/pd_disaggregation.md)
- [Speculative Decoding](./docs/features/speculative_decoding.md)
- [Prefix Caching](./docs/features/prefix_caching.md)
- [Chunked Prefill](./docs/features/chunked_prefill.md)
## Acknowledgement
FastDeploy is licensed under the [Apache-2.0 open-source license](./LICENSE). During development, portions of [vLLM](https://github.com/vllm-project/vllm) code were referenced and incorporated to maintain interface compatibility, for which we express our gratitude.

106
benchmarks/README.md Normal file
View File

@@ -0,0 +1,106 @@
### FastDeploy Serving Performance Benchmark Tool
#### Dataset:
Download it locally with wget for performance testing.
<table style="width:100%; border-collapse: collapse;">
<thead>
<tr>
<th style="width:15%; text-align: left;">Dataset</th>
<th style="width:65%; text-align: left;">Data Path</th>
</tr>
</thead>
<tbody>
<tr>
<td><strong>Open-source dataset (2,000 samples)</strong></td>
<td><code>https://fastdeploy.bj.bcebos.com/eb_query/filtered_sharedgpt_2000_input_1136_output_200_fd.json</code></td>
</tr>
</tbody>
</table>
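The EBChat dataset loader added in `benchmark_dataset.py` in this commit reads the FD-format file as one JSON object per line, each with a `messages` list and an optional `max_tokens` field. A hypothetical sketch of producing one such record (the prompt text and file name are illustrative only):

```python
# Hypothetical sketch of one FD-format record as read by EBChatDataset
# (one JSON object per line, with "messages" and an optional "max_tokens").
import json

sample = {
    "messages": [{"role": "user", "content": "你好,你的名字是什么?"}],
    "max_tokens": 12288,
}

with open("sample_fd_dataset.jsonl", "w", encoding="utf-8") as f:
    f.write(json.dumps(sample, ensure_ascii=False) + "\n")
```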
#### Usage:
```
# Install dependencies
python -m pip install -r requirements.txt
```
##### Parameter Description
```bash
--backend openai-chat: backend interface for the benchmark; set to "openai-chat" to use the chat/completions endpoint
--model EB45T: model name; any name works and only affects the name of the saved result file
--endpoint /v1/chat/completions: endpoint, used to build the request URL
--host 0.0.0.0: service IP address, used to build the request URL
--port 9812: service HTTP port, used to build the request URL
--dataset-name EBChat: dataset class; set to "EBChat" to read re-exported FD-format datasets
--dataset-path ./eb45t_spv4_dataserver_1w_waigua_fd: path to the benchmark dataset
--hyperparameter-path EB45T.yaml: (optional) hyperparameter file; its contents are merged into the request payload (no hyperparameters by default)
--percentile-metrics ttft,tpot,itl,e2el,s_ttft,s_itl,s_e2el,s_decode,input_len,s_input_len,output_len: set of metrics reported in the performance results
--metric-percentiles 80,95,99,99.9,99.95,99.99: percentiles reported for the performance metrics
--num-prompts 1: total number of requests to send
--max-concurrency 1: benchmark concurrency
--save-result: enable result saving; results are written to a json file
```
##### Benchmarking the /v1/chat/completions endpoint (single-request debugging)
```
python benchmark_serving.py \
--backend openai-chat \
--model EB45T \
--endpoint /v1/chat/completions \
--host 0.0.0.0 \
--port 9812 \
--dataset-name EBChat \
--dataset-path ./filtered_sharedgpt_2000_input_1136_output_200_fd.json \
--hyperparameter-path yaml/request_yaml/eb45t-32k.yaml \
--percentile-metrics ttft,tpot,itl,e2el,s_ttft,s_itl,s_e2el,s_decode,input_len,s_input_len,output_len \
--metric-percentiles 80,95,99,99.9,99.95,99.99 \
--num-prompts 1 \
--max-concurrency 1 \
--save-result
```
##### Full benchmark of the /v1/chat/completions endpoint (100 concurrent, 2000 requests)
```
# Save output to infer_log.txt
python benchmark_serving.py \
--backend openai-chat \
--model EB45T \
--endpoint /v1/chat/completions \
--host 0.0.0.0 \
--port 9812 \
--dataset-name EBChat \
--dataset-path ./filtered_sharedgpt_2000_input_1136_output_200_fd.json \
--hyperparameter-path yaml/request_yaml/eb45t-32k.yaml \
--percentile-metrics ttft,tpot,itl,e2el,s_ttft,s_itl,s_e2el,s_decode,input_len,s_input_len,output_len \
--metric-percentiles 80,95,99,99.9,99.95,99.99 \
--num-prompts 2000 \
--max-concurrency 100 \
--save-result > infer_log.txt 2>&1 &
```
##### Benchmarking the /v1/completions endpoint
Change the endpoint to /v1/completions and the backend to openai to benchmark the /v1/completions endpoint.
```
# Save output to infer_log.txt
python benchmark_serving.py \
--backend openai \
--model EB45T \
--endpoint /v1/completions \
--host 0.0.0.0 \
--port 9812 \
--dataset-name EBChat \
--dataset-path ./filtered_sharedgpt_2000_input_1136_output_200_fd.json \
--hyperparameter-path yaml/request_yaml/eb45t-32k.yaml \
--percentile-metrics ttft,tpot,itl,e2el,s_ttft,s_itl,s_e2el,s_decode,input_len,s_input_len,output_len \
--metric-percentiles 80,95,99,99.9,99.95,99.99 \
--num-prompts 2000 \
--max-concurrency 100 \
--save-result > infer_log.txt 2>&1 &
```

View File

@@ -0,0 +1,700 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
# This file is modified from https://github.com/vllm-project/vllm/blob/main/benchmarks/backend_request_func.py
import io
import json
import os
import sys
import time
import traceback
from dataclasses import dataclass, field
from typing import Optional
import aiohttp
from tqdm.asyncio import tqdm
AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=6 * 60 * 60)
@dataclass
class RequestFuncInput:
"""Input for requesting LLMs via API"""
prompt: str
history_QA: Optional[dict]
hyper_parameters: dict
api_url: str
prompt_len: int
output_len: int
model: str
model_name: Optional[str] = None
logprobs: Optional[int] = None
extra_body: Optional[dict] = None
multi_modal_content: Optional[dict] = None
ignore_eos: bool = False
language: Optional[str] = None
@dataclass
class RequestFuncOutput:
"""Output for requesting LLMs via API"""
generated_text: str = ""
reasoning_content: str = ""
success: bool = False
latency: float = 0.0
output_tokens: int = 0
ttft: float = 0.0 # Time to first token
arrival_time: list = field(default_factory=list) # arrival_time
itl: list = field(default_factory=list) # list of inter-token latencies
tpot: float = 0.0 # avg next-token latencies
prompt_len: int = 0
prompt_tokens: int = 0  # number of input tokens reported by the inference side
error: str = ""
async def async_request_eb_openai_chat_completions(
request_func_input: RequestFuncInput,
pbar: Optional[tqdm] = None,
) -> RequestFuncOutput:
"""Request an LLM using EB OpenAI"""
api_url = request_func_input.api_url
assert api_url.endswith(
("completions", "profile")
), "OpenAI Chat Completions API URL must end with 'completions'."
async with aiohttp.ClientSession(trust_env=True,
timeout=AIOHTTP_TIMEOUT) as session:
content = [{"type": "text", "text": request_func_input.prompt}]
if request_func_input.multi_modal_content:
content.append(request_func_input.multi_modal_content)
payload = {
"model": "default",
"messages": request_func_input.history_QA,
"stream": True,
"stream_options": {
"include_usage": True,
"continuous_usage_stats": True
},
}
# hyperparameters are passed in via the yaml file
payload.update(request_func_input.hyper_parameters)
if request_func_input.ignore_eos:
payload["ignore_eos"] = request_func_input.ignore_eos
headers = {
"Content-Type": "application/json",
"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}",
}
output = RequestFuncOutput()
output.prompt_len = 0
ttft = 0.0
st = time.perf_counter()
most_recent_timestamp = st
try:
async with session.post(url=api_url, json=payload,
headers=headers) as response:
if response.status == 200:
async for chunk_bytes in response.content:
chunk_bytes = chunk_bytes.strip()
if not chunk_bytes:
continue
chunk = chunk_bytes.decode("utf-8").removeprefix(
"data: ")
if chunk != "[DONE]":
# print("####chunk:", chunk, type(chunk))
timestamp = time.perf_counter()
data = json.loads(chunk)
if choices := data.get("choices"):
content = choices[0]["delta"].get("content")
reason_content = choices[0]["delta"].get("reasoning_content")
# First token
if ttft == 0.0:
ttft = timestamp - st
output.ttft = ttft
# cached_tokens
output.prompt_len = data["usage"]["prompt_tokens_details"]["cached_tokens"]
# Decoding phase
else:
output.itl.append(timestamp -
most_recent_timestamp)
output.generated_text += content or ""
output.reasoning_content += reason_content or ""
output.arrival_time.append(choices[0].get("arrival_time"))
elif usage := data.get("usage"):
output.output_tokens = usage.get(
"completion_tokens")
output.prompt_tokens = usage.get(
"prompt_tokens")
most_recent_timestamp = timestamp
# output.generated_text = generated_text
if output.generated_text.strip() == "":
output.success = False
output.error = "No generated text found!"
else:
output.success = True
output.latency = most_recent_timestamp - st
else:
error_text = await response.text()
print("####error response:", error_text, "####payload:", payload)
output.error = error_text or ""
output.success = False
except Exception:
output.success = False
exc_info = sys.exc_info()
output.error = "".join(traceback.format_exception(*exc_info))
# save the results of failed requests
if not output.success:
with open("error_output.txt", "a") as f:
f.write(str(output) + "\n")
if pbar:
pbar.update(1)
return output
async def async_request_eb_openai_completions(
request_func_input: RequestFuncInput,
pbar: Optional[tqdm] = None,
) -> RequestFuncOutput:
"""Request an LLM using EB OpenAI"""
api_url = request_func_input.api_url
assert api_url.endswith(
("completions", "profile")
), "OpenAI Completions API URL must end with 'completions' or 'profile'."
async with aiohttp.ClientSession(trust_env=True,
timeout=AIOHTTP_TIMEOUT) as session:
payload = {
"model": "default",
"prompt": request_func_input.prompt,
"stream": True,
"stream_options": {
"include_usage": True,
"continuous_usage_stats": True
},
}
# hyperparameters are passed in via the yaml file
payload.update(request_func_input.hyper_parameters)
if request_func_input.ignore_eos:
payload["ignore_eos"] = request_func_input.ignore_eos
headers = {
"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"
}
output = RequestFuncOutput()
output.prompt_len = request_func_input.prompt_len
generated_text = ""
st = time.perf_counter()
most_recent_timestamp = st
try:
async with session.post(url=api_url, json=payload,
headers=headers) as response:
if response.status == 200:
first_chunk_received = False
async for chunk_bytes in response.content:
chunk_bytes = chunk_bytes.strip()
if not chunk_bytes:
continue
chunk = chunk_bytes.decode("utf-8").removeprefix(
"data: ")
if chunk != "[DONE]":
# print("####chunk:", chunk, chunk.usage)
data = json.loads(chunk)
# NOTE: Some completion API might have a last
# usage summary response without a token so we
# want to check a token was generated
if choices := data.get("choices"):
# Note that text could be empty here
# e.g. for special tokens
text = choices[0].get("text")
timestamp = time.perf_counter()
# First token
if not first_chunk_received:
first_chunk_received = True
ttft = time.perf_counter() - st
output.ttft = ttft
# Decoding phase
else:
output.itl.append(timestamp -
most_recent_timestamp)
most_recent_timestamp = timestamp
output.arrival_time.append(choices[0].get("arrival_time"))
generated_text += text or ""
elif usage := data.get("usage"):
output.prompt_tokens = usage.get(
"prompt_tokens")
output.output_tokens = usage.get(
"completion_tokens")
if first_chunk_received:
output.success = True
else:
output.success = False
output.error = (
"Never received a valid chunk to calculate TTFT."
"This response will be marked as failed!")
output.generated_text = generated_text
output.latency = most_recent_timestamp - st
else:
output.error = response.reason or ""
output.success = False
except Exception:
output.success = False
exc_info = sys.exc_info()
output.error = "".join(traceback.format_exception(*exc_info))
if pbar:
pbar.update(1)
return output
async def async_request_tgi(
request_func_input: RequestFuncInput,
pbar: Optional[tqdm] = None,
) -> RequestFuncOutput:
"""Request an LLM using the TGI API"""
api_url = request_func_input.api_url
assert api_url.endswith("generate_stream")
async with aiohttp.ClientSession(trust_env=True,
timeout=AIOHTTP_TIMEOUT) as session:
params = {
"max_new_tokens": request_func_input.output_len,
"do_sample": True,
"temperature": 0.01, # TGI does not accept 0.0 temperature.
"top_p": 0.99, # TGI does not accept 1.0 top_p.
"truncate": request_func_input.prompt_len,
"ignore_eos_token": request_func_input.ignore_eos,
}
payload = {
"inputs": request_func_input.prompt,
"parameters": params,
}
output = RequestFuncOutput()
output.prompt_len = request_func_input.prompt_len
if request_func_input.ignore_eos:
output.output_tokens = request_func_input.output_len
else:
output.output_tokens = None
ttft = 0.0
st = time.perf_counter()
most_recent_timestamp = st
try:
async with session.post(url=api_url, json=payload) as response:
if response.status == 200:
async for chunk_bytes in response.content:
chunk_bytes = chunk_bytes.strip()
if not chunk_bytes:
continue
chunk_bytes = chunk_bytes.decode("utf-8")
# NOTE: Sometimes TGI returns a ping response without
# any data, we should skip it.
if chunk_bytes.startswith(":"):
continue
chunk = chunk_bytes.removeprefix("data:")
data = json.loads(chunk)
timestamp = time.perf_counter()
# First token
if ttft == 0.0:
ttft = time.perf_counter() - st
output.ttft = ttft
# Decoding phase
else:
output.itl.append(timestamp -
most_recent_timestamp)
most_recent_timestamp = timestamp
output.arrival_time.append(data["arrival_time"])
output.latency = most_recent_timestamp - st
output.success = True
output.generated_text = data["generated_text"]
else:
output.error = response.reason or ""
output.success = False
except Exception:
output.success = False
exc_info = sys.exc_info()
output.error = "".join(traceback.format_exception(*exc_info))
if pbar:
pbar.update(1)
return output
async def async_request_trt_llm(
request_func_input: RequestFuncInput,
pbar: Optional[tqdm] = None,
) -> RequestFuncOutput:
"""Request an LLM using TRT's llm_server"""
api_url = request_func_input.api_url
assert api_url.endswith("generate_stream")
async with aiohttp.ClientSession(trust_env=True,
timeout=AIOHTTP_TIMEOUT) as session:
payload = {
"accumulate_tokens": True,
"text_input": request_func_input.prompt,
"temperature": 0.0,
"top_p": 1.0,
"max_tokens": request_func_input.output_len,
"stream": True,
}
if request_func_input.ignore_eos:
payload["min_length"] = request_func_input.output_len
output = RequestFuncOutput()
output.prompt_len = request_func_input.prompt_len
ttft = 0.0
st = time.perf_counter()
most_recent_timestamp = st
try:
async with session.post(url=api_url, json=payload) as response:
if response.status == 200:
async for chunk_bytes in response.content:
chunk_bytes = chunk_bytes.strip()
if not chunk_bytes:
continue
chunk = chunk_bytes.decode("utf-8").removeprefix(
"data:")
data = json.loads(chunk)
output.generated_text += data["text_output"]
timestamp = time.perf_counter()
# First token
if ttft == 0.0:
ttft = timestamp - st
output.ttft = ttft
# Decoding phase
else:
output.itl.append(timestamp -
most_recent_timestamp)
most_recent_timestamp = timestamp
output.latency = most_recent_timestamp - st
output.success = True
else:
output.error = response.reason or ""
output.success = False
except Exception:
output.success = False
exc_info = sys.exc_info()
output.error = "".join(traceback.format_exception(*exc_info))
if pbar:
pbar.update(1)
return output
async def async_request_deepspeed_mii(
request_func_input: RequestFuncInput,
pbar: Optional[tqdm] = None,
) -> RequestFuncOutput:
"""Request an LLM using Deepspeed MII"""
async with aiohttp.ClientSession(trust_env=True,
timeout=AIOHTTP_TIMEOUT) as session:
payload = {
"prompt": request_func_input.prompt,
"max_tokens": request_func_input.output_len,
"temperature": 0.01, # deepspeed-mii does not accept 0.0 temp.
"top_p": 1.0,
}
output = RequestFuncOutput()
output.prompt_len = request_func_input.prompt_len
# NOTE: DeepSpeed-MII doesn't support streaming as of Jan 28 2024,
# will use 0 as placeholder.
# See https://github.com/microsoft/DeepSpeed-MII/pull/311
output.ttft = 0
st = time.perf_counter()
try:
async with session.post(url=request_func_input.api_url,
json=payload) as response:
if response.status == 200:
parsed_resp = await response.json()
output.latency = time.perf_counter() - st
if "choices" in parsed_resp:
output.generated_text = parsed_resp["choices"][0][
"text"]
elif "text" in parsed_resp:
output.generated_text = parsed_resp["text"][0]
else:
output.error = ("Unexpected response format: "
"neither 'choices' nor 'text' found")
output.success = False
output.success = True
else:
output.error = response.reason or ""
output.success = False
except Exception:
output.success = False
exc_info = sys.exc_info()
output.error = "".join(traceback.format_exception(*exc_info))
if pbar:
pbar.update(1)
return output
async def async_request_openai_completions(
request_func_input: RequestFuncInput,
pbar: Optional[tqdm] = None,
) -> RequestFuncOutput:
"""Request an LLM using OpenAI"""
api_url = request_func_input.api_url
assert api_url.endswith(
("completions", "profile")
), "OpenAI Completions API URL must end with 'completions' or 'profile'."
async with aiohttp.ClientSession(trust_env=True,
timeout=AIOHTTP_TIMEOUT) as session:
payload = {
"model": request_func_input.model_name \
if request_func_input.model_name else request_func_input.model,
"prompt": request_func_input.prompt,
# "temperature": 0.0,
"max_tokens": request_func_input.output_len,
"logprobs": request_func_input.logprobs,
"stream": True,
#"stream_options": {
# "include_usage": True,
#},
}
if request_func_input.ignore_eos:
payload["ignore_eos"] = request_func_input.ignore_eos
headers = {
"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"
}
output = RequestFuncOutput()
output.prompt_len = request_func_input.prompt_len
generated_text = ""
st = time.perf_counter()
most_recent_timestamp = st
try:
async with session.post(url=api_url, json=payload,
headers=headers) as response:
if response.status == 200:
first_chunk_received = False
async for chunk_bytes in response.content:
chunk_bytes = chunk_bytes.strip()
if not chunk_bytes:
continue
chunk = chunk_bytes.decode("utf-8").removeprefix(
"data: ")
if chunk != "[DONE]":
# print("####chunk:", chunk, type(chunk))
data = json.loads(chunk)
# NOTE: Some completion API might have a last
# usage summary response without a token so we
# want to check a token was generated
if choices := data.get("choices"):
# Note that text could be empty here
# e.g. for special tokens
text = choices[0].get("text")
timestamp = time.perf_counter()
# First token
if not first_chunk_received:
first_chunk_received = True
ttft = time.perf_counter() - st
output.ttft = ttft
# Decoding phase
else:
output.itl.append(timestamp -
most_recent_timestamp)
most_recent_timestamp = timestamp
generated_text += text or ""
elif usage := data.get("usage"):
output.output_tokens = usage.get(
"completion_tokens")
if first_chunk_received:
output.success = True
else:
output.success = False
output.error = (
"Never received a valid chunk to calculate TTFT."
"This response will be marked as failed!")
output.generated_text = generated_text
output.latency = most_recent_timestamp - st
else:
output.error = response.reason or ""
output.success = False
except Exception:
output.success = False
exc_info = sys.exc_info()
output.error = "".join(traceback.format_exception(*exc_info))
if pbar:
pbar.update(1)
return output
async def async_request_openai_audio(
request_func_input: RequestFuncInput,
pbar: Optional[tqdm] = None,
) -> RequestFuncOutput:
"""Request an LLM using OpenAI"""
# Lazy import without PlaceholderModule to avoid vllm dep.
import soundfile
api_url = request_func_input.api_url
assert api_url.endswith(
("transcriptions", "translations"
)), "OpenAI Chat Completions API URL must end with 'transcriptions' "
"or `translations`."
async with aiohttp.ClientSession(trust_env=True,
timeout=AIOHTTP_TIMEOUT) as session:
content = [{"type": "text", "text": request_func_input.prompt}]
payload = {
"model": request_func_input.model_name \
if request_func_input.model_name else request_func_input.model,
"temperature": 0.0,
"max_completion_tokens": request_func_input.output_len,
"stream": True,
"language": "en",
# Flattened due to multipart/form-data
"stream_include_usage": True,
"stream_continuous_usage_stats": True
}
if request_func_input.extra_body:
payload.update(request_func_input.extra_body)
headers = {
"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}",
}
# Send audio file
def to_bytes(y, sr):
buffer = io.BytesIO()
soundfile.write(buffer, y, sr, format="WAV")
buffer.seek(0)
return buffer
with to_bytes(*request_func_input.multi_modal_content['audio']) as f:
form = aiohttp.FormData()
form.add_field('file', f, content_type='audio/wav')
for key, value in payload.items():
form.add_field(key, str(value))
output = RequestFuncOutput()
output.prompt_len = request_func_input.prompt_len
generated_text = ""
ttft = 0.0
st = time.perf_counter()
most_recent_timestamp = st
try:
async with session.post(url=api_url,
data=form,
headers=headers) as response:
if response.status == 200:
async for chunk_bytes in response.content:
chunk_bytes = chunk_bytes.strip()
if not chunk_bytes:
continue
chunk = chunk_bytes.decode("utf-8").removeprefix(
"data: ")
if chunk != "[DONE]":
timestamp = time.perf_counter()
data = json.loads(chunk)
if choices := data.get("choices"):
content = choices[0]["delta"].get(
"content")
# First token
if ttft == 0.0:
ttft = timestamp - st
output.ttft = ttft
# Decoding phase
else:
output.itl.append(
timestamp - most_recent_timestamp)
generated_text += content or ""
elif usage := data.get("usage"):
output.output_tokens = usage.get(
"completion_tokens")
most_recent_timestamp = timestamp
output.generated_text = generated_text
output.success = True
output.latency = most_recent_timestamp - st
else:
output.error = response.reason or ""
output.success = False
except Exception:
output.success = False
exc_info = sys.exc_info()
output.error = "".join(traceback.format_exception(*exc_info))
if pbar:
pbar.update(1)
return output
ASYNC_REQUEST_FUNCS = {
"tgi": async_request_tgi,
"vllm": async_request_openai_completions,
"lmdeploy": async_request_openai_completions,
"deepspeed-mii": async_request_deepspeed_mii,
"openai": async_request_eb_openai_completions,
"openai-chat": async_request_eb_openai_chat_completions,
"openai-audio": async_request_openai_audio,
"tensorrt-llm": async_request_trt_llm,
"scalellm": async_request_openai_completions,
"sglang": async_request_openai_completions,
}
OPENAI_COMPATIBLE_BACKENDS = [
k for k, v in ASYNC_REQUEST_FUNCS.items()
if v in (async_request_openai_completions,
async_request_eb_openai_chat_completions)
]

View File

@@ -0,0 +1,309 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
# This file is modified from https://github.com/vllm-project/vllm/blob/main/benchmarks/benchmark_dataset.py
import base64
import io
import json
import logging
import random
from abc import ABC, abstractmethod
from collections.abc import Mapping
from dataclasses import dataclass
from io import BytesIO
from typing import Any, Callable, Optional, Union
from PIL import Image
logger = logging.getLogger(__name__)
@dataclass
class SampleRequest:
"""
Represents a single inference request for benchmarking.
"""
prompt: Union[str, Any]
history_QA: Union[str, Any]
json_data: Optional[dict]
prompt_len: int
expected_output_len: int
class BenchmarkDataset(ABC):
"""BenchmarkDataset"""
DEFAULT_SEED = 0
IS_MULTIMODAL = False
def __init__(
self,
dataset_path: Optional[str] = None,
random_seed: int = DEFAULT_SEED,
hyperparameter_path: Optional[str] = None,
) -> None:
"""
Initialize the BenchmarkDataset with an optional dataset path and random
seed. Args:
dataset_path (Optional[str]): Path to the dataset. If None, it
indicates that a default or random dataset might be used.
random_seed (int): Seed value for reproducible shuffling or
sampling. Defaults to DEFAULT_SEED.
"""
self.dataset_path = dataset_path
# Set the random seed, ensuring that a None value is replaced with the
# default seed.
self.random_seed = (random_seed
if random_seed is not None else self.DEFAULT_SEED)
self.data = None
self.hyperparameter_path = hyperparameter_path
self.hyperparameters = {}
def load_data(self) -> None:
"""
Load data from the dataset path into self.data.
This method must be overridden by subclasses since the method to load
data will vary depending on the dataset format and source.
Raises:
NotImplementedError: If a subclass does not implement this method.
"""
# TODO (jenniferzhao): add support for downloading data
raise NotImplementedError(
"load_data must be implemented in subclasses.")
@abstractmethod
def sample(self, num_requests: int) -> list[SampleRequest]:
"""
Abstract method to generate sample requests from the dataset.
Subclasses must override this method to implement dataset-specific logic
for generating a list of SampleRequest objects.
Args:
num_requests (int): The number of sample requests to generate.
Returns:
list[SampleRequest]: A list of sample requests generated from the
dataset.
"""
raise NotImplementedError("sample must be implemented in subclasses.")
def maybe_oversample_requests(self, requests: list[SampleRequest],
num_requests: int) -> None:
"""
Oversamples the list of requests if its size is less than the desired
number.
Args:
requests (List[SampleRequest]): The current list of sampled
requests. num_requests (int): The target number of requests.
"""
if len(requests) < num_requests:
random.seed(self.random_seed)
additional = random.choices(requests,
k=num_requests - len(requests))
requests.extend(additional)
logger.info("Oversampled requests to reach %d total samples.",
num_requests)
def is_valid_sequence(
prompt_len: int,
output_len: int,
min_len: int = 4,
max_prompt_len: int = 1024,
max_total_len: int = 2048,
skip_min_output_len_check: bool = False,
) -> bool:
"""
Validate a sequence based on prompt and output lengths.
Default pruning criteria are copied from the original `sample_hf_requests`
and `sample_sharegpt_requests` functions in benchmark_serving.py, as well as
from `sample_requests` in benchmark_throughput.py.
"""
# Check for invalid conditions
prompt_too_short = prompt_len < min_len
output_too_short = (not skip_min_output_len_check) and (output_len
< min_len)
prompt_too_long = prompt_len > max_prompt_len
combined_too_long = (prompt_len + output_len) > max_total_len
# Return True if none of the invalid conditions are met
return not (prompt_too_short or output_too_short or prompt_too_long
or combined_too_long)
def process_image(image: Any) -> Mapping[str, Any]:
"""
Process a single image input and return a multimedia content dictionary.
Supports three input types:
1. Dictionary with raw image bytes: - Expects a dict with a 'bytes' key
containing raw image data. - Loads the bytes as a PIL.Image.Image.
2. PIL.Image.Image input: - Converts the image to RGB. - Saves the image as
a JPEG in memory. - Encodes the JPEG data as a base64 string. - Returns
a dictionary with the image as a base64 data URL.
3. String input: - Treats the string as a URL or local file path. -
Prepends "file://" if the string doesn't start with "http://" or
"file://". - Returns a dictionary with the image URL.
Raises:
ValueError: If the input is not a supported type.
"""
if isinstance(image, dict) and 'bytes' in image:
image = Image.open(BytesIO(image['bytes']))
if isinstance(image, Image.Image):
image = image.convert("RGB")
with io.BytesIO() as image_data:
image.save(image_data, format="JPEG")
image_base64 = base64.b64encode(
image_data.getvalue()).decode("utf-8")
return {
"type": "image_url",
"image_url": {
"url": f"data:image/jpeg;base64,{image_base64}"
},
}
if isinstance(image, str):
image_url = (image if image.startswith(
("http://", "file://")) else f"file://{image}")
return {"type": "image_url", "image_url": {"url": image_url}}
raise ValueError(f"Invalid image input {image}. Must be a PIL.Image.Image"
" or str or dictionary with raw image bytes.")
class EBDataset(BenchmarkDataset):
"""
Implements the ShareGPT dataset. Loads data from a JSON file and generates
sample requests based on conversation turns.
"""
temperature: float
repetition_penalty: float
frequency_penalty: float
presence_penalty: float
top_p: float
prompt_len: int
def __init__(self, **kwargs) -> None:
super().__init__(**kwargs)
self.load_data()
def load_data(self) -> None:
if self.dataset_path is None:
raise ValueError("dataset_path must be provided for loading data.")
with open(self.dataset_path, encoding="utf-8") as f:
self.data = [json.loads(i.strip()) for i in f.readlines()]
def sample(
self,
num_requests: int,
lora_path: Optional[str] = None,
max_loras: Optional[int] = None,
output_len: Optional[int] = None,
enable_multimodal_chat: bool = False,
**kwargs,
) -> list:
samples: list = []
for entry in self.data:
if len(samples) >= num_requests:
break
prompt = entry["text"]
self.temperature = float(entry["temperature"])
self.repetition_penalty = float(entry["penalty_score"])
self.frequency_penalty = float(entry["frequency_score"])
self.presence_penalty = float(entry["presence_score"])
self.top_p = float(entry["topp"])
self.prompt_len = int(entry["input_token_num"])
new_output_len = int(entry["max_dec_len"])
if enable_multimodal_chat:
prompt = self.apply_multimodal_chat_transformation(
prompt, None)
samples.append(
SampleRequest(
prompt=prompt,
prompt_len=self.prompt_len,
history_QA=[],
expected_output_len=new_output_len,
))
self.maybe_oversample_requests(samples, num_requests)
return samples
class EBChatDataset(BenchmarkDataset):
"""
Implements the ShareGPT dataset. Loads data from a JSON file and generates
sample requests based on conversation turns.
"""
prompt_len: int
def __init__(self, **kwargs) -> None:
super().__init__(**kwargs)
self.load_data()
def load_data(self) -> None:
if self.dataset_path is None:
raise ValueError("dataset_path must be provided for loading data.")
with open(self.dataset_path, encoding="utf-8") as f:
self.data = [json.loads(i.strip()) for i in f.readlines()]
def sample(
self,
num_requests: int,
lora_path: Optional[str] = None,
max_loras: Optional[int] = None,
output_len: Optional[int] = None,
enable_multimodal_chat: bool = False,
**kwargs,
) -> list:
samples: list = []
for entry in self.data:
if len(samples) >= num_requests:
break
json_data = entry
prompt = entry["messages"][-1].get("content", "")
history_QA = entry.get("messages", [])
new_output_len = int(entry.get("max_tokens", 12288))
if enable_multimodal_chat:
prompt = self.apply_multimodal_chat_transformation(
prompt, None)
samples.append(
SampleRequest(
json_data=json_data,
prompt=prompt,
prompt_len=0,
history_QA=history_QA,
expected_output_len=new_output_len,
))
self.maybe_oversample_requests(samples, num_requests)
return samples

File diff suppressed because it is too large

View File

@@ -0,0 +1,90 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
# This file is modified from https://github.com/vllm-project/vllm/blob/main/benchmarks/benchmark_utils.py
import argparse
import json
import math
import os
from typing import Any
def convert_to_pytorch_benchmark_format(args: argparse.Namespace,
metrics: dict[str, list],
extra_info: dict[str, Any]) -> list:
"""
Save the benchmark results in the format used by PyTorch OSS benchmark with
one metric per record
https://github.com/pytorch/pytorch/wiki/How-to-integrate-with-PyTorch-OSS-benchmark-database
"""
records = []
if not os.environ.get("SAVE_TO_PYTORCH_BENCHMARK_FORMAT", False):
return records
for name, benchmark_values in metrics.items():
record = {
"benchmark": {
"name": "vLLM benchmark",
"extra_info": {
"args": vars(args),
},
},
"model": {
"name": args.model,
},
"metric": {
"name": name,
"benchmark_values": benchmark_values,
"extra_info": extra_info,
},
}
tp = record["benchmark"]["extra_info"]["args"].get(
"tensor_parallel_size")
# Save tensor_parallel_size parameter if it's part of the metadata
if not tp and "tensor_parallel_size" in extra_info:
record["benchmark"]["extra_info"]["args"][
"tensor_parallel_size"] = extra_info["tensor_parallel_size"]
records.append(record)
return records
class InfEncoder(json.JSONEncoder):
"""InfEncoder"""
def clear_inf(self, o: Any):
"""clear_inf"""
if isinstance(o, dict):
return {k: self.clear_inf(v) for k, v in o.items()}
elif isinstance(o, list):
return [self.clear_inf(v) for v in o]
elif isinstance(o, float) and math.isinf(o):
return "inf"
return o
def iterencode(self, o: Any, *args, **kwargs) -> Any:
"""iterencode"""
return super().iterencode(self.clear_inf(o), *args, **kwargs)
def write_to_json(filename: str, records: list) -> None:
"""write_to_json"""
with open(filename, "w") as f:
json.dump(records, f, cls=InfEncoder)

View File

@@ -0,0 +1,5 @@
aiohttp
tqdm
numpy
Pillow
pyyaml

View File

@@ -0,0 +1,8 @@
enable_chunked_prefill: True
max_model_len: 131072
max_num_seqs: 16
kv_cache_ratio: 0.75
tensor_parallel_size: 8
max_num_batched_tokens: 4096
max_num_partial_prefills: 3
max_long_partial_prefills: 3

View File

@@ -0,0 +1,5 @@
max_model_len: 131072
max_num_seqs: 40
gpu_memory_utilization: 0.9
tensor_parallel_size: 8
quantization: wint4

View File

@@ -0,0 +1,8 @@
enable_chunked_prefill: True
max_model_len: 131072
max_num_seqs: 16
kv_cache_ratio: 0.75
tensor_parallel_size: 8
max_num_batched_tokens: 4096
max_num_partial_prefills: 3
max_long_partial_prefills: 3

View File

@@ -0,0 +1,10 @@
enable_mm: True
max_model_len: 32768
max_num_seqs: 128
gpu_memory_utilization: 0.9
kv_cache_ratio: 0.71
tensor_parallel_size: 1
enable_chunked_prefill: True
max_num_batched_tokens: 384
quantization: wint4
reasoning_parser: ernie-45-vl

View File

@@ -0,0 +1,5 @@
max_model_len: 32768
max_num_seqs: 128
kv_cache_ratio: 0.75
tensor_parallel_size: 1
max_num_batched_tokens: 32768

View File

@@ -0,0 +1,5 @@
max_model_len: 32768
max_num_seqs: 32
kv_cache_ratio: 0.5
tensor_parallel_size: 1
quantization: wint4

View File

@@ -0,0 +1,6 @@
max_model_len: 32768
max_num_seqs: 128
kv_cache_ratio: 0.75
tensor_parallel_size: 1
max_num_batched_tokens: 32768
quantization: wint4

View File

@@ -0,0 +1,6 @@
max_model_len: 32768
max_num_seqs: 128
kv_cache_ratio: 0.75
tensor_parallel_size: 1
max_num_batched_tokens: 32768
quantization: wint8

View File

@@ -0,0 +1,5 @@
max_model_len: 32768
max_num_seqs: 128
kv_cache_ratio: 0.75
tensor_parallel_size: 1
max_num_batched_tokens: 32768

View File

@@ -0,0 +1,12 @@
max_model_len: 32768
max_num_seqs: 256
tensor_parallel_size: 8
quantization: block_wise_fp8
gpu_memory_utilization: 0.9
kv_cache_ratio: 0.8
enable_chunked_prefill: True
max_num_batched_tokens: 1024
max_num_partial_prefills: 3
max_long_partial_prefills: 3
enable_prefix_caching: True
swap_space: 200

View File

@@ -0,0 +1,11 @@
max_model_len: 32768
max_num_seqs: 256
tensor_parallel_size: 8
gpu_memory_utilization: 0.9
kv_cache_ratio: 0.8
enable_chunked_prefill: True
max_num_batched_tokens: 1024
max_num_partial_prefills: 3
max_long_partial_prefills: 3
enable_prefix_caching: True
swap_space: 200

View File

@@ -0,0 +1,5 @@
max_model_len: 32768
max_num_seqs: 96
gpu_memory_utilization: 0.9
kv_cache_ratio: 0.71
tensor_parallel_size: 4

View File

@@ -0,0 +1,15 @@
max_model_len: 32768
max_num_seqs: 256
gpu_memory_utilization: 0.9
kv_cache_ratio: 0.8
tensor_parallel_size: 4
cache_queue_port: 55663
enable_chunked_prefill: True
splitwise_role: decode
engine_worker_queue_port: 6678
cache_transfer_protocol: "rdma,ipc"
rdma_comm_ports: "7671,7672,7673,7674"
pd_comm_port: "2334"
max_num_batched_tokens: 384
max_num_partial_prefills: 3
max_long_partial_prefills: 3

View File

@@ -0,0 +1,12 @@
max_model_len: 32768
max_num_seqs: 16
gpu_memory_utilization: 0.9
kv_cache_ratio: 0.9
tensor_parallel_size: 4
splitwise_role: prefill
enable_prefix_caching: True
cache_queue_port: 55664
engine_worker_queue_port: 6677
cache_transfer_protocol: "rdma,ipc"
rdma_comm_ports: "7675,7676,7677,7678"
pd_comm_port: "2333"

View File

@@ -0,0 +1,6 @@
max_model_len: 32768
max_num_seqs: 128
kv_cache_ratio: 0.75
tensor_parallel_size: 1
enable_prefix_caching: true
enable_chunked_prefill: true

View File

@@ -0,0 +1,5 @@
max_model_len: 32768
max_num_seqs: 96
gpu_memory_utilization: 0.9
kv_cache_ratio: 0.71
tensor_parallel_size: 4

View File

@@ -0,0 +1,13 @@
max_model_len: 32768
max_num_seqs: 256
gpu_memory_utilization: 0.9
kv_cache_ratio: 0.8
tensor_parallel_size: 1
data_parallel_size: 8
num_gpu_blocks_override: 1024
cache_queue_port: 55663
splitwise_role: decode
engine_worker_queue_port: 6678
cache_transfer_protocol: "rdma"
rdma_comm_ports: "7671,7672,7673,7674,7675,7676,7677,7678"
pd_comm_port: "2334"

View File

@@ -0,0 +1,13 @@
max_model_len: 32768
max_num_seqs: 16
gpu_memory_utilization: 0.9
kv_cache_ratio: 0.9
tensor_parallel_size: 1
data_parallel_size: 8
splitwise_role: prefill
cache_queue_port: 55664
engine_worker_queue_port: 6677
num_gpu_blocks_override: 1024
cache_transfer_protocol: "rdma"
rdma_comm_ports: "7671,7672,7673,7674,7675,7676,7677,7678"
pd_comm_port: "2334"

View File

@@ -0,0 +1,6 @@
max_model_len: 32768
max_num_seqs: 96
gpu_memory_utilization: 0.9
kv_cache_ratio: 0.71
tensor_parallel_size: 4
quantization: wint4

View File

@@ -0,0 +1,13 @@
max_model_len: 32768
max_num_seqs: 128
gpu_memory_utilization: 0.9
kv_cache_ratio: 0.7
tensor_parallel_size: 4
cache_queue_port: 55663
enable_chunked_prefill: False
enable_prefix_caching: False
splitwise_role: decode
engine_worker_queue_port: 6678
cache_transfer_protocol: "rdma,ipc"
rdma_comm_ports: "7671,7672,7673,7674"
pd_comm_port: "2334"

View File

@@ -0,0 +1,12 @@
max_model_len: 32768
max_num_seqs: 16
gpu_memory_utilization: 0.9
kv_cache_ratio: 0.9
tensor_parallel_size: 4
splitwise_role: prefill
enable_prefix_caching: False
cache_queue_port: 55664
engine_worker_queue_port: 6677
cache_transfer_protocol: "rdma,ipc"
rdma_comm_ports: "7675,7676,7677,7678"
pd_comm_port: "2333"

View File

@@ -0,0 +1,5 @@
max_model_len: 32768
max_num_seqs: 40
tensor_parallel_size: 4
quantization: wint4
gpu_memory_utilization: 0.9

View File

@@ -0,0 +1,5 @@
max_model_len: 32768
max_num_seqs: 160
tensor_parallel_size: 8
quantization: wint4
gpu_memory_utilization: 0.9

View File

@@ -0,0 +1,8 @@
enable_prefix_caching: True
max_model_len: 32768
max_num_seqs: 128
gpu_memory_utilization: 0.9
kv_cache_ratio: 0.71
tensor_parallel_size: 4
swap_space: 200
cache_queue_port: 55664

View File

@@ -0,0 +1,15 @@
max_model_len: 32768
max_num_seqs: 256
gpu_memory_utilization: 0.9
kv_cache_ratio: 0.8
tensor_parallel_size: 4
cache_queue_port: 55663
enable_chunked_prefill: True
splitwise_role: decode
engine_worker_queue_port: 6678
cache_transfer_protocol: "rdma,ipc"
rdma_comm_ports: "7671,7672,7673,7674"
pd_comm_port: "2334"
max_num_batched_tokens: 384
max_num_partial_prefills: 3
max_long_partial_prefills: 3

View File

@@ -0,0 +1,12 @@
max_model_len: 32768
max_num_seqs: 16
gpu_memory_utilization: 0.9
kv_cache_ratio: 0.9
tensor_parallel_size: 4
splitwise_role: prefill
enable_prefix_caching: True
cache_queue_port: 55664
engine_worker_queue_port: 6677
cache_transfer_protocol: "rdma,ipc"
rdma_comm_ports: "7675,7676,7677,7678"
pd_comm_port: "2333"

View File

@@ -0,0 +1,5 @@
max_model_len: 32768
max_num_seqs: 96
gpu_memory_utilization: 0.9
kv_cache_ratio: 0.71
tensor_parallel_size: 8

View File

@@ -0,0 +1,5 @@
max_model_len: 32768
max_num_seqs: 80
tensor_parallel_size: 8
quantization: wint8
gpu_memory_utilization: 0.9

View File

@@ -0,0 +1,9 @@
enable_prefix_caching: True
max_model_len: 32768
max_num_batched_tokens: 68304
max_num_seqs: 128
gpu_memory_utilization: 0.9
kv_cache_ratio: 0.71
tensor_parallel_size: 8
swap_space: 100
cache_queue_port: 55664

View File

@@ -0,0 +1,9 @@
enable_mm: True
max_model_len: 32768
max_num_seqs: 56
gpu_memory_utilization: 0.9
kv_cache_ratio: 0.8
tensor_parallel_size: 8
quantization: wint4
limit_mm_per_prompt: '{"image": 100, "video": 100}'
reasoning_parser: ernie-45-vl

View File

@@ -0,0 +1,11 @@
enable_mm: True
max_model_len: 32768
max_num_seqs: 56
gpu_memory_utilization: 0.8
kv_cache_ratio: 0.8
tensor_parallel_size: 8
quantization: wint4
limit_mm_per_prompt: '{"image": 100, "video": 100}'
enable_chunked_prefill: True
max_num_batched_tokens: 384
reasoning_parser: ernie-45-vl

View File

@@ -0,0 +1,9 @@
enable_mm: True
max_model_len: 32768
max_num_seqs: 36
gpu_memory_utilization: 0.9
kv_cache_ratio: 0.8
tensor_parallel_size: 4
quantization: wint4
limit_mm_per_prompt: '{"image": 100, "video": 100}'
reasoning_parser: ernie-45-vl

View File

@@ -0,0 +1,9 @@
enable_mm: True
max_model_len: 32768
max_num_seqs: 36
gpu_memory_utilization: 0.95
kv_cache_ratio: 0.8
tensor_parallel_size: 8
quantization: wint8
limit_mm_per_prompt: '{"image": 100, "video": 100}'
reasoning_parser: ernie-45-vl

View File

@@ -0,0 +1,11 @@
enable_mm: True
max_model_len: 32768
max_num_seqs: 36
gpu_memory_utilization: 0.8
kv_cache_ratio: 0.8
tensor_parallel_size: 8
quantization: wint8
limit_mm_per_prompt: '{"image": 100, "video": 100}'
enable_chunked_prefill: True
max_num_batched_tokens: 384
reasoning_parser: ernie-45-vl

View File

@@ -0,0 +1,9 @@
enable_mm: True
max_model_len: 32768
max_num_seqs: 36
gpu_memory_utilization: 0.9
kv_cache_ratio: 0.8
tensor_parallel_size: 4
quantization: wint8
limit_mm_per_prompt: '{"image": 100, "video": 100}'
reasoning_parser: ernie-45-vl

View File

@@ -0,0 +1,5 @@
max_model_len: 32768
max_num_seqs: 128
kv_cache_ratio: 0.75
tensor_parallel_size: 1
enable_static_graph_inference: True

View File

@@ -0,0 +1,5 @@
max_model_len: 32768
max_num_seqs: 128
kv_cache_ratio: 0.75
tensor_parallel_size: 1
enable_static_graph_inference: True

View File

@@ -0,0 +1,6 @@
max_model_len: 32768
max_num_seqs: 128
kv_cache_ratio: 0.75
tensor_parallel_size: 1
quantization: wint8
enable_static_graph_inference: True

View File

@@ -0,0 +1,6 @@
max_model_len: 32768
max_num_seqs: 128
kv_cache_ratio: 0.75
tensor_parallel_size: 1
quantization: wint8
enable_static_graph_inference: True

View File

@@ -0,0 +1,5 @@
max_model_len: 32768
max_num_seqs: 128
kv_cache_ratio: 0.75
tensor_parallel_size: 1
enable_static_graph_inference: True

View File

@@ -0,0 +1,6 @@
max_model_len: 32768
max_num_seqs: 128
kv_cache_ratio: 0.75
tensor_parallel_size: 1
quantization: wint4
enable_static_graph_inference: True

View File

@@ -0,0 +1,6 @@
max_model_len: 32768
max_num_seqs: 96
gpu_memory_utilization: 0.9
kv_cache_ratio: 0.71
tensor_parallel_size: 4
enable_static_graph_inference: True

View File

@@ -0,0 +1,5 @@
max_model_len: 32768
max_num_seqs: 128
kv_cache_ratio: 0.75
tensor_parallel_size: 1
enable_static_graph_inference: True

View File

@@ -0,0 +1,5 @@
max_model_len: 32768
max_num_seqs: 128
kv_cache_ratio: 0.75
tensor_parallel_size: 1
enable_static_graph_inference: True

View File

@@ -0,0 +1,4 @@
max_model_len: 32768
max_num_seqs: 128
kv_cache_ratio: 0.75
tensor_parallel_size: 1

View File

@@ -0,0 +1,6 @@
max_model_len: 32768
max_num_seqs: 128
kv_cache_ratio: 0.75
tensor_parallel_size: 1
quantization: wfp8afp8
enable_static_graph_inference: True

View File

@@ -0,0 +1,5 @@
max_model_len: 32768
max_num_seqs: 128
kv_cache_ratio: 0.75
tensor_parallel_size: 1
quantization: wfp8afp8

View File

@@ -0,0 +1,5 @@
max_model_len: 32768
max_num_seqs: 128
kv_cache_ratio: 0.75
tensor_parallel_size: 1
quantization: wint8

View File

@@ -0,0 +1,5 @@
max_model_len: 32768
max_num_seqs: 128
kv_cache_ratio: 0.75
tensor_parallel_size: 1
enable_static_graph_inference: True

View File

@@ -0,0 +1,5 @@
max_model_len: 32768
max_num_seqs: 128
kv_cache_ratio: 0.75
tensor_parallel_size: 1
enable_static_graph_inference: True

View File

@@ -0,0 +1,6 @@
max_model_len: 32768
max_num_seqs: 128
kv_cache_ratio: 0.75
tensor_parallel_size: 1
quantization: wint8
enable_static_graph_inference: True

View File

@@ -0,0 +1,6 @@
max_model_len: 32768
max_num_seqs: 128
kv_cache_ratio: 0.75
tensor_parallel_size: 1
quantization: wint8
enable_static_graph_inference: True

View File

@@ -0,0 +1,5 @@
max_model_len: 32768
max_num_seqs: 128
kv_cache_ratio: 0.75
tensor_parallel_size: 1
enable_static_graph_inference: True

View File

@@ -0,0 +1,6 @@
max_model_len: 32768
max_num_seqs: 128
kv_cache_ratio: 0.75
tensor_parallel_size: 1
quantization: wint4
enable_static_graph_inference: True

View File

@@ -0,0 +1,5 @@
max_model_len: 32768
max_num_seqs: 256
gpu_memory_utilization: 0.9
kv_cache_ratio: 0.75
tensor_parallel_size: 1

View File

@@ -0,0 +1,5 @@
max_model_len: 32768
max_num_seqs: 256
gpu_memory_utilization: 0.9
kv_cache_ratio: 0.75
tensor_parallel_size: 1

View File

@@ -0,0 +1,5 @@
max_model_len: 32768
max_num_seqs: 256
gpu_memory_utilization: 0.9
kv_cache_ratio: 0.75
tensor_parallel_size: 1

View File

@@ -0,0 +1,6 @@
max_model_len: 32768
max_num_seqs: 256
gpu_memory_utilization: 0.9
kv_cache_ratio: 0.75
quantization: wint8
tensor_parallel_size: 1

View File

@@ -0,0 +1,6 @@
max_model_len: 32768
max_num_seqs: 256
gpu_memory_utilization: 0.9
kv_cache_ratio: 0.75
quantization: wint8
tensor_parallel_size: 1

View File

@@ -0,0 +1,6 @@
max_model_len: 32768
max_num_seqs: 256
gpu_memory_utilization: 0.9
kv_cache_ratio: 0.75
quantization: wint8
tensor_parallel_size: 1

View File

@@ -0,0 +1,6 @@
max_model_len: 32768
max_num_seqs: 75
gpu_memory_utilization: 0.85
kv_cache_ratio: 0.75
quantization: wint4
tensor_parallel_size: 4

View File

@@ -0,0 +1,6 @@
max_model_len: 32768
max_num_seqs: 25
gpu_memory_utilization: 0.9
kv_cache_ratio: 0.75
quantization: wint8
tensor_parallel_size: 4

View File

@@ -0,0 +1,5 @@
max_model_len: 32768
max_num_seqs: 50
gpu_memory_utilization: 0.85
kv_cache_ratio: 0.75
tensor_parallel_size: 1

View File

@@ -0,0 +1,5 @@
max_model_len: 32768
max_num_seqs: 50
gpu_memory_utilization: 0.85
kv_cache_ratio: 0.75
tensor_parallel_size: 1

View File

@@ -0,0 +1,6 @@
max_model_len: 32768
max_num_seqs: 50
gpu_memory_utilization: 0.8
kv_cache_ratio: 0.75
quantization: wint4
tensor_parallel_size: 1

View File

@@ -0,0 +1,6 @@
max_model_len: 32768
max_num_seqs: 50
gpu_memory_utilization: 0.8
kv_cache_ratio: 0.75
quantization: wint4
tensor_parallel_size: 1

View File

@@ -0,0 +1,8 @@
top_p: 0.8
temperature: 0.8
metadata:
min_tokens: 1
max_tokens: 131071
repetition_penalty: 1.0
frequency_penalty: 0
presence_penalty: 0

View File

@@ -0,0 +1,8 @@
top_p: 0.8
temperature: 0.8
metadata:
min_tokens: 1
max_tokens: 12288
repetition_penalty: 1.0
frequency_penalty: 0
presence_penalty: 0

View File

@@ -0,0 +1,8 @@
top_p: 0.8
temperature: 0.7
metadata:
min_tokens: 1
max_tokens: 12288
repetition_penalty: 1.05
frequency_penalty: 0
presence_penalty: 0

View File

@@ -0,0 +1,8 @@
top_p: 0.8
temperature: 0.7
metadata:
min_tokens: 1
max_tokens: 12288
repetition_penalty: 1.0
frequency_penalty: 0
presence_penalty: 1.5

View File

@@ -0,0 +1,8 @@
top_p: 0.95
temperature: 0.6
metadata:
min_tokens: 1
max_tokens: 32767
repetition_penalty: 1.0
frequency_penalty: 0
presence_penalty: 0

View File

@@ -0,0 +1,6 @@
tensor_parallel_size: 8
max_model_len: 32768
max_num_seqs: 32
num_gpu_blocks_override: 4096
kv_cache_ratio: 0.5
reasoning_parser: ernie-x1

View File

@@ -0,0 +1,6 @@
max_model_len: 32768
max_num_seqs: 32
gpu_memory_utilization: 0.9
tensor_parallel_size: 4
quantization: wint4
reasoning_parser: ernie-x1


@@ -0,0 +1,6 @@
max_model_len: 32768
max_num_seqs: 128
gpu_memory_utilization: 0.9
tensor_parallel_size: 8
quantization: wint4
reasoning_parser: ernie-x1


@@ -0,0 +1,10 @@
enable_prefix_caching: True
num_gpu_blocks_override: 8000
max_model_len: 32768
max_num_seqs: 64
gpu_memory_utilization: 0.85
kv_cache_ratio: 0.5
tensor_parallel_size: 8
swap_space: 200
cache_queue_port: 55664
reasoning_parser: ernie-x1


@@ -0,0 +1,6 @@
tensor_parallel_size: 8
max_model_len: 32768
max_num_seqs: 32
num_gpu_blocks_override: 4096
kv_cache_ratio: 0.5
reasoning_parser: ernie-x1


@@ -0,0 +1,6 @@
max_model_len: 32768
max_num_seqs: 8
gpu_memory_utilization: 0.9
tensor_parallel_size: 4
quantization: wint8
reasoning_parser: ernie-x1


@@ -0,0 +1,6 @@
max_model_len: 32768
max_num_seqs: 64
gpu_memory_utilization: 0.9
tensor_parallel_size: 8
quantization: wint8
reasoning_parser: ernie-x1


@@ -0,0 +1,10 @@
enable_prefix_caching: True
num_gpu_blocks_override: 8000
max_model_len: 32768
max_num_seqs: 64
gpu_memory_utilization: 0.85
kv_cache_ratio: 0.5
tensor_parallel_size: 8
swap_space: 200
cache_queue_port: 55664
reasoning_parser: ernie-x1
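
These templates target the ernie-x1 reasoning parser and, in two of them, turn on prefix caching with an explicit block override, swap space, and a cache-queue port. A small sketch that pulls out just the caching-related keys (the swap_space unit is not stated in the diff and is assumed here to be GB):

```python
CACHE_KEYS = ("enable_prefix_caching", "num_gpu_blocks_override",
              "swap_space", "cache_queue_port")

def caching_options(cfg: dict) -> dict:
    # Keep only the prefix-caching knobs introduced by the templates above.
    return {k: cfg[k] for k in CACHE_KEYS if k in cfg}

print(caching_options({
    "enable_prefix_caching": True,
    "num_gpu_blocks_override": 8000,
    "swap_space": 200,          # unit assumed to be GB; the diff does not say
    "cache_queue_port": 55664,
    "reasoning_parser": "ernie-x1",
}))
```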


@@ -17,8 +17,9 @@
BUILD_WHEEL=${1:-1}
PYTHON_VERSION=${2:-"python"}
export python=$PYTHON_VERSION
CPU_USE_BF16=${3:-"false"}
BUILDING_ARCS=${4:-""}
FD_CPU_USE_BF16=${3:-"false"}
FD_BUILDING_ARCS=${4:-""}
# paddle distributed use to set archs
unset PADDLE_CUDA_ARCH_LIST
@@ -30,13 +31,9 @@ EGG_DIR="fastdeploy.egg-info"
# custom_ops directory config
OPS_SRC_DIR="custom_ops"
OPS_BUILD_DIR="build"
OPS_EGG_DIR="efficitentllm_ops.egg-info"
OPS_TMP_DIR_BASE="tmp_base"
OPS_TMP_DIR="tmp"
TEST_DIR="tests"
# command line log config
RED='\033[0;31m'
BLUE='\033[0;34m'
@@ -44,13 +41,14 @@ GREEN='\033[1;32m'
BOLD='\033[1m'
NONE='\033[0m'
DEVICE_TYPE="gpu"
function python_version_check() {
PY_MAIN_VERSION=`${python} -V 2>&1 | awk '{print $2}' | awk -F '.' '{print $1}'`
PY_SUB_VERSION=`${python} -V 2>&1 | awk '{print $2}' | awk -F '.' '{print $2}'`
echo -e "find python version ${PY_MAIN_VERSION}.${PY_SUB_VERSION}"
if [ $PY_MAIN_VERSION -ne "3" -o $PY_SUB_VERSION -lt "8" ]; then
echo -e "${RED}FAIL:${NONE} please use Python >= 3.8"
if [ $PY_MAIN_VERSION -ne "3" -o $PY_SUB_VERSION -lt "9" ]; then
echo -e "${RED}FAIL:${NONE} please use Python >= 3.9"
exit 1
fi
}
@@ -75,6 +73,7 @@ function copy_ops(){
WHEEL_CPU_NAME="fastdeploy_cpu_ops-${OPS_VERSION}-${PY_VERSION}-${SYSTEM_VERSION}-${PROCESSOR_VERSION}.egg"
is_rocm=`$python -c "import paddle; print(paddle.is_compiled_with_rocm())"`
if [ "$is_rocm" = "True" ]; then
DEVICE_TYPE="rocm"
cp -r ./${OPS_TMP_DIR}/${WHEEL_NAME}/* ../fastdeploy/model_executor/ops/gpu
echo -e "ROCM ops have been copy to fastdeploy"
return
@@ -82,6 +81,7 @@ function copy_ops(){
mkdir -p ../fastdeploy/model_executor/ops/base
is_cuda=`$python -c "import paddle; print(paddle.is_compiled_with_cuda())"`
if [ "$is_cuda" = "True" ]; then
DEVICE_TYPE="gpu"
cp -r ./${OPS_TMP_DIR_BASE}/${WHEEL_BASE_NAME}/* ../fastdeploy/model_executor/ops/base
cp -r ./${OPS_TMP_DIR}/${WHEEL_NAME}/* ../fastdeploy/model_executor/ops/gpu
echo -e "BASE and CUDA ops have been copy to fastdeploy"
@@ -90,6 +90,7 @@ function copy_ops(){
is_xpu=`$python -c "import paddle; print(paddle.is_compiled_with_xpu())"`
if [ "$is_xpu" = "True" ]; then
DEVICE_TYPE="xpu"
cp -r ./${OPS_TMP_DIR}/${WHEEL_NAME}/* ../fastdeploy/model_executor/ops/xpu
echo -e "xpu ops have been copy to fastdeploy"
return
@@ -97,20 +98,14 @@ function copy_ops(){
is_npu=`$python -c "import paddle; print(paddle.is_compiled_with_custom_device('npu'))"`
if [ "$is_npu" = "True" ]; then
DEVICE_TYPE="npu"
cp -r ${OPS_TMP_DIR}/${WHEEL_NAME}/* ../fastdeploy/model_executor/ops/npu
echo -e "npu ops have been copy to fastdeploy"
return
fi
DEVICE_TYPE="cpu"
cp -r ./${OPS_TMP_DIR_BASE}/${WHEEL_BASE_NAME}/* ../fastdeploy/model_executor/ops/base
cd ${OPS_TMP_DIR}/${WHEEL_CPU_NAME}/xFasterTransformer/build/
for file in *_pd_.so; do
mv "$file" "${file/_pd_/}"
done
cd ../../x86-simd-sort/builddir/
for file in *_pd_.so; do
mv "$file" "${file/_pd_/}"
done
cd ../../../../
cp -r ${OPS_TMP_DIR}/${WHEEL_CPU_NAME}/* ../fastdeploy/model_executor/ops/cpu
echo -e "BASE and CPU ops have been copy to fastdeploy"
@@ -122,15 +117,30 @@ function build_and_install_ops() {
export no_proxy=bcebos.com,paddlepaddle.org.cn,${no_proxy}
echo -e "${BLUE}[build]${NONE} build and install fastdeploy_base_ops..."
${python} setup_ops_base.py install --install-lib ${OPS_TMP_DIR_BASE}
find ${OPS_TMP_DIR_BASE} -type f -name "*.o" -exec rm -f {} \;
echo -e "${BLUE}[build]${NONE} build and install fastdeploy_ops..."
if [ "$CPU_USE_BF16" == "true" ]; then
CPU_USE_BF16=True ${python} setup_ops.py install --install-lib ${OPS_TMP_DIR}
:
elif [ "$CPU_USE_BF16" == "false" ]; then
TMP_DIR_REAL_PATH=`readlink -f ${OPS_TMP_DIR}`
is_xpu=`$python -c "import paddle; print(paddle.is_compiled_with_xpu())"`
if [ "$is_xpu" = "True" ]; then
cd xpu_ops/src
bash build.sh ${TMP_DIR_REAL_PATH}
cd ../..
elif [ "$FD_CPU_USE_BF16" == "true" ]; then
if [ "$FD_BUILDING_ARCS" == "" ]; then
FD_CPU_USE_BF16=True ${python} setup_ops.py install --install-lib ${OPS_TMP_DIR}
else
FD_BUILDING_ARCS=${FD_BUILDING_ARCS} FD_CPU_USE_BF16=True ${python} setup_ops.py install --install-lib ${OPS_TMP_DIR}
fi
find ${OPS_TMP_DIR} -type f -name "*.o" -exec rm -f {} \;
elif [ "$FD_CPU_USE_BF16" == "false" ]; then
if [ "$FD_BUILDING_ARCS" == "" ]; then
${python} setup_ops.py install --install-lib ${OPS_TMP_DIR}
:
else
FD_BUILDING_ARCS=${FD_BUILDING_ARCS} ${python} setup_ops.py install --install-lib ${OPS_TMP_DIR}
fi
find ${OPS_TMP_DIR} -type f -name "*.o" -exec rm -f {} \;
else
echo "Error: Invalid parameter '$CPU_USE_BF16'. Please use true or false."
echo "Error: Invalid parameter '$FD_CPU_USE_BF16'. Please use true or false."
exit 1
fi
if [ $? -ne 0 ]; then
@@ -146,11 +156,7 @@ function build_and_install_ops() {
function build_and_install() {
echo -e "${BLUE}[build]${NONE} building fastdeploy wheel..."
if [ "$BUILDING_ARCS" == "" ]; then
${python} setup.py bdist_wheel --python-tag py3
else
BUILDING_ARCS=${BUILDING_ARCS} ${python} setup.py bdist_wheel --python-tag py3
fi
${python} setup.py bdist_wheel --python-tag=py3
if [ $? -ne 0 ]; then
echo -e "${RED}[FAIL]${NONE} build fastdeploy wheel failed"
@@ -174,10 +180,12 @@ function cleanup() {
rm -rf $BUILD_DIR $EGG_DIR
if [ `${python} -m pip list | grep fastdeploy | wc -l` -gt 0 ]; then
echo -e "${BLUE}[init]${NONE} uninstalling fastdeploy..."
${python} -m pip uninstall -y fastdeploy
${python} -m pip uninstall -y fastdeploy-${DEVICE_TYPE}
fi
rm -rf $OPS_SRC_DIR/$BUILD_DIR $OPS_SRC_DIR/$EGG_DIR
rm -rf $OPS_SRC_DIR/$OPS_TMP_DIR_BASE
rm -rf $OPS_SRC_DIR/$OPS_TMP_DIR
}
function abort() {
@@ -187,7 +195,7 @@ function abort() {
cur_dir=`basename "$pwd"`
rm -rf $BUILD_DIR $EGG_DIR $DIST_DIR
${python} -m pip uninstall -y fastdeploy
${python} -m pip uninstall -y fastdeploy-${DEVICE_TYPE}
rm -rf $OPS_SRC_DIR/$BUILD_DIR $OPS_SRC_DIR/$EGG_DIR
}


@@ -0,0 +1,643 @@
From 5112002c155dceecc5e5983cdb67157e4f5400e2 Mon Sep 17 00:00:00 2001
From: minghaipeng <minghaipeng@baidu.com>
Date: Wed, 25 Jun 2025 15:05:24 +0800
Subject: [PATCH] DeepGEMM 95e81b3
---
deep_gemm/__init__.py | 2 +-
deep_gemm/include/deep_gemm/scheduler.cuh | 2 +-
deep_gemm/jit/compiler.py | 2 +-
deep_gemm/jit/interleave_ffma.py | 2 +-
deep_gemm/jit/runtime.py | 4 +-
deep_gemm/jit/template.py | 34 ++++----
deep_gemm/jit_kernels/gemm.py | 44 +++++------
deep_gemm/jit_kernels/m_grouped_gemm.py | 96 +++++++++++------------
deep_gemm/jit_kernels/tuner.py | 10 +--
deep_gemm/jit_kernels/utils.py | 18 +++--
deep_gemm/paddle_utils.py | 20 +++++
deep_gemm/utils.py | 30 +++----
12 files changed, 143 insertions(+), 121 deletions(-)
create mode 100644 deep_gemm/paddle_utils.py
diff --git a/deep_gemm/__init__.py b/deep_gemm/__init__.py
index 15b22ca..63e7fb7 100644
--- a/deep_gemm/__init__.py
+++ b/deep_gemm/__init__.py
@@ -1,4 +1,4 @@
-import torch
+import paddle
from . import jit
from .jit_kernels import (
diff --git a/deep_gemm/include/deep_gemm/scheduler.cuh b/deep_gemm/include/deep_gemm/scheduler.cuh
index 9743871..6c97152 100644
--- a/deep_gemm/include/deep_gemm/scheduler.cuh
+++ b/deep_gemm/include/deep_gemm/scheduler.cuh
@@ -102,7 +102,7 @@ struct Scheduler {
if constexpr (kGemmType == GemmType::Normal) {
return block_idx * block_size;
} else if constexpr (kGemmType == GemmType::GroupedContiguous) {
- auto offset = kIgnoreGroupedForGroupedContiguous ? 0 : __ldg(grouped_layout + m_block_idx * BLOCK_M);
+ auto offset = kIgnoreGroupedForGroupedContiguous ? 0 : max(0, __ldg(grouped_layout + m_block_idx * BLOCK_M));
return offset * shape_dim + block_idx * block_size;
} else if constexpr (kGemmType == GemmType::GroupedMasked) {
return curr_group_idx * shape_dim + block_idx * block_size;
diff --git a/deep_gemm/jit/compiler.py b/deep_gemm/jit/compiler.py
index c17d466..6fdc52f 100644
--- a/deep_gemm/jit/compiler.py
+++ b/deep_gemm/jit/compiler.py
@@ -4,7 +4,7 @@ import os
import re
import subprocess
import uuid
-from torch.utils.cpp_extension import CUDA_HOME
+from ..paddle_utils import CUDA_HOME
from typing import Tuple
from . import interleave_ffma
diff --git a/deep_gemm/jit/interleave_ffma.py b/deep_gemm/jit/interleave_ffma.py
index fcb377e..db9d6f3 100644
--- a/deep_gemm/jit/interleave_ffma.py
+++ b/deep_gemm/jit/interleave_ffma.py
@@ -3,7 +3,7 @@ import mmap
import os
import re
import subprocess
-from torch.utils.cpp_extension import CUDA_HOME
+from ..paddle_utils import CUDA_HOME
def run_cuobjdump(file_path):
diff --git a/deep_gemm/jit/runtime.py b/deep_gemm/jit/runtime.py
index 66c370a..4761426 100644
--- a/deep_gemm/jit/runtime.py
+++ b/deep_gemm/jit/runtime.py
@@ -1,6 +1,6 @@
import ctypes
import os
-import torch
+import paddle
from typing import Optional
from .template import map_ctype
@@ -35,7 +35,7 @@ class Runtime:
assert len(args) == len(self.args), f'Expected {len(self.args)} arguments, got {len(args)}'
cargs = []
for arg, (name, dtype) in zip(args, self.args):
- if isinstance(arg, torch.Tensor):
+ if isinstance(arg, paddle.Tensor):
assert arg.dtype == dtype, f'Expected tensor dtype `{dtype}` for `{name}`, got `{arg.dtype}`'
else:
assert isinstance(arg, dtype), f'Expected built-in type `{dtype}` for `{name}`, got `{type(arg)}`'
diff --git a/deep_gemm/jit/template.py b/deep_gemm/jit/template.py
index ead37f5..51b02c1 100644
--- a/deep_gemm/jit/template.py
+++ b/deep_gemm/jit/template.py
@@ -1,24 +1,24 @@
import copy
import ctypes
import os
-import torch
+import paddle
from typing import Any, Dict, Iterable, Tuple
# Name map for Python `eval`
typename_map: Dict[Any, str] = {
**{t: t.__name__ for t in (bool, int, float)},
- torch.int: 'torch.int',
- torch.float: 'torch.float',
- torch.bfloat16: 'torch.bfloat16',
- torch.float8_e4m3fn: 'torch.float8_e4m3fn',
- torch.cuda.Stream: 'torch.cuda.Stream',
+ paddle.int32: 'paddle.int32',
+ paddle.float32: 'paddle.float32',
+ paddle.bfloat16: 'paddle.bfloat16',
+ paddle.float8_e4m3fn: 'paddle.float8_e4m3fn',
+ paddle.device.cuda.Stream: "paddle.device.cuda.Stream",
}
# `ctype` map for Python casting
ctype_map: Dict[Any, Any] = {
**{t: getattr(ctypes, f'c_{t.__name__}') for t in (bool, int, float)},
- **{t: ctypes.c_void_p for t in (torch.int, torch.float, torch.bfloat16, torch.float8_e4m3fn, torch.cuda.Stream)},
+ **{t: ctypes.c_void_p for t in (paddle.int32, paddle.float32, paddle.bfloat16, paddle.float8_e4m3fn, paddle.device.cuda.Stream)},
}
@@ -27,25 +27,25 @@ genc_map = {
bool: ('bool', 'bool'),
int: ('int', 'int'),
float: ('float', 'float'),
- torch.int: ('void*', 'int*'),
- torch.float: ('void*', 'float*'),
- torch.bfloat16: ('void*', '__nv_bfloat16*'),
- torch.float8_e4m3fn: ('void*', '__nv_fp8_e4m3*'),
- torch.cuda.Stream: ('void*', 'cudaStream_t'),
+ paddle.int32: ('void*', 'int*'),
+ paddle.float32: ('void*', 'float*'),
+ paddle.bfloat16: ('void*', '__nv_bfloat16*'),
+ paddle.float8_e4m3fn: ('void*', '__nv_fp8_e4m3*'),
+ paddle.device.cuda.Stream: ('void*', 'cudaStream_t'),
}
def map_ctype(value: Any) -> Any:
if hasattr(value, 'data_ptr'):
- if value.dtype == torch.int:
+ if value.dtype == paddle.int32:
return ctypes.c_void_p(value.data_ptr())
- elif value.dtype == torch.float:
+ elif value.dtype == paddle.float32:
return ctypes.c_void_p(value.data_ptr())
- elif value.dtype == torch.bfloat16:
+ elif value.dtype == paddle.bfloat16:
return ctypes.c_void_p(value.data_ptr())
- elif value.dtype == torch.float16:
+ elif value.dtype == paddle.float16:
return ctypes.c_void_p(value.data_ptr())
- elif value.dtype == torch.float8_e4m3fn:
+ elif value.dtype == paddle.float8_e4m3fn:
return ctypes.c_void_p(value.data_ptr())
else:
return ctypes.c_void_p(value.data_ptr())
diff --git a/deep_gemm/jit_kernels/gemm.py b/deep_gemm/jit_kernels/gemm.py
index cb438b7..44aa0ed 100644
--- a/deep_gemm/jit_kernels/gemm.py
+++ b/deep_gemm/jit_kernels/gemm.py
@@ -1,5 +1,5 @@
import math
-import torch
+import paddle
from functools import lru_cache
from typing import Tuple
@@ -166,20 +166,20 @@ def get_best_configs(m: int, n: int, k: int, num_groups: int, num_sms: int,
return num_min_sms, best_block_m, best_block_n, best_num_stages, best_tma_multicast_config, best_smem_config
-def gemm_fp8_fp8_bf16_nt(lhs: Tuple[torch.Tensor, torch.Tensor],
- rhs: Tuple[torch.Tensor, torch.Tensor],
- out: torch.Tensor) -> None:
+def gemm_fp8_fp8_bf16_nt(lhs: Tuple[paddle.Tensor, paddle.Tensor],
+ rhs: Tuple[paddle.Tensor, paddle.Tensor],
+ out: paddle.Tensor) -> None:
"""
Do a normal GEMM with FP8 inputs and BF16 output, with 1x128 LHS scaling and 128x128 RHS scaling.
LHS, RHS, RHS scaling factors, and output tensors must be in contiguous format.
RHS and RHS scaling factors are required to be transposed.
The LHS scaling tensor requires TMA-aligned transposed format, if your input does not match the requirement,
- this function will do a transposing with a set of slow PyTorch operations.
+ this function will do a transposing with a set of slow paddle operations.
Arguments:
- lhs: the first element is an FP8 tensor (typed `torch.float8_e4m3fn`) of shape `[m, k]`,
+ lhs: the first element is an FP8 tensor (typed `paddle.float8_e4m3fn`) of shape `[m, k]`,
the second element is an FP32 1x128 scaling tensor for LHS of shape `[m, ⌈k / 128⌉]`.
- rhs: the first element is an FP8 tensor (typed `torch.float8_e4m3fn`) of shape `[n, k]`.
+ rhs: the first element is an FP8 tensor (typed `paddle.float8_e4m3fn`) of shape `[n, k]`.
the second element is an FP32 128x128 scaling tensor for RHS of shape `[⌈n / 128⌉, ⌈k / 128⌉]`.
out: the BF16 output tensor of shape `[m, n]`, representing the result.
"""
@@ -189,22 +189,22 @@ def gemm_fp8_fp8_bf16_nt(lhs: Tuple[torch.Tensor, torch.Tensor],
n, k_ = rhs.shape
m_, n_ = out.shape
- assert n % 64 == 0 and k % 128 == 0
+ # assert n % 64 == 0 and k % 128 == 0
# Type and shape checks
- assert m == m_ and n == n_ and k == k_
- assert n > 0 and k > 0
- assert lhs_scales.shape == (m, (k + 127) // 128)
- assert rhs_scales.shape == ((n + 127) // 128, (k + 127) // 128)
- assert lhs.dtype == torch.float8_e4m3fn and lhs_scales.dtype == torch.float32
- assert rhs.dtype == torch.float8_e4m3fn and rhs_scales.dtype == torch.float32
- assert out.dtype == torch.bfloat16
- assert lhs.is_contiguous() and rhs.is_contiguous() and out.is_contiguous()
+ # assert m == m_ and n == n_ and k == k_
+ # assert n > 0 and k > 0
+ # assert lhs_scales.shape == (m, (k + 127) // 128)
+ # assert rhs_scales.shape == ((n + 127) // 128, (k + 127) // 128)
+ # assert lhs.dtype == paddle.float8_e4m3fn and lhs_scales.dtype == paddle.float32
+ # assert rhs.dtype == paddle.float8_e4m3fn and rhs_scales.dtype == paddle.float32
+ # assert out.dtype == paddle.bfloat16
+ # assert lhs.is_contiguous() and rhs.is_contiguous() and out.is_contiguous()
# LHS scales must be transposed for TMA load, but not for RHS scales
# NOTES: `get_tma_aligned_lhs_scales` may launch a kernel if not processed by previous kernels
lhs_scales = get_col_major_tma_aligned_tensor(lhs_scales)
- assert rhs_scales.is_contiguous()
+ # assert rhs_scales.is_contiguous()
# Do nothing if `m` is zero
if m == 0:
@@ -214,7 +214,7 @@ def gemm_fp8_fp8_bf16_nt(lhs: Tuple[torch.Tensor, torch.Tensor],
global includes, template
num_sms = get_num_sms()
num_sms, block_m, block_n, num_stages, tma_multicast_config, smem_config = get_best_configs(m, n, k, 1, num_sms)
- args = (lhs, lhs_scales, rhs, rhs_scales, out, m, torch.cuda.current_stream(), num_sms, smem_config[0])
+ args = (lhs, lhs_scales, rhs, rhs_scales, out, m, paddle.device.cuda.current_stream(), num_sms, smem_config[0])
runtime = jit_tuner.compile_and_tune(
name='gemm_fp8_fp8_bf16_nt',
keys={'N': n, 'K': k, 'BLOCK_M': block_m, 'BLOCK_N': block_n,
@@ -225,10 +225,10 @@ def gemm_fp8_fp8_bf16_nt(lhs: Tuple[torch.Tensor, torch.Tensor],
'IS_TMA_MULTICAST_ON_A': tma_multicast_config[1]},
space=(),
includes=includes,
- arg_defs=(('lhs', torch.float8_e4m3fn), ('lhs_scales', torch.float),
- ('rhs', torch.float8_e4m3fn), ('rhs_scales', torch.float),
- ('out', torch.bfloat16), ('m', int),
- ('stream', torch.cuda.Stream), ('num_sms', int), ('smem_size', int)),
+ arg_defs=(('lhs', paddle.float8_e4m3fn), ('lhs_scales', paddle.float32),
+ ('rhs', paddle.float8_e4m3fn), ('rhs_scales', paddle.float32),
+ ('out', paddle.bfloat16), ('m', int),
+ ('stream', paddle.device.cuda.Stream), ('num_sms', int), ('smem_size', int)),
template=template,
args=args
)
diff --git a/deep_gemm/jit_kernels/m_grouped_gemm.py b/deep_gemm/jit_kernels/m_grouped_gemm.py
index 3b518c9..ba776bd 100644
--- a/deep_gemm/jit_kernels/m_grouped_gemm.py
+++ b/deep_gemm/jit_kernels/m_grouped_gemm.py
@@ -1,4 +1,4 @@
-import torch
+import paddle
from typing import Tuple
from .gemm import get_best_configs, get_block_n_padding_for_smem_d
@@ -37,25 +37,25 @@ gemm_t::run(out, rhs_scales, grouped_layout,
"""
-def m_grouped_gemm_fp8_fp8_bf16_nt_contiguous(lhs: Tuple[torch.Tensor, torch.Tensor],
- rhs: Tuple[torch.Tensor, torch.Tensor],
- out: torch.Tensor, m_indices: torch.Tensor) -> None:
+def m_grouped_gemm_fp8_fp8_bf16_nt_contiguous(lhs: Tuple[paddle.Tensor, paddle.Tensor],
+ rhs: Tuple[paddle.Tensor, paddle.Tensor],
+ out: paddle.Tensor, m_indices: paddle.Tensor) -> None:
"""
Do a grouped GEMM (contiguous format) with FP8 inputs and BF16 output, with 1x128 LHS scaling and 128x128 RHS scaling.
LHS, RHS, RHS scaling factors, and output tensors must be in contiguous format.
RHS and RHS scaling factors are required to be transposed.
The LHS scaling tensor requires TMA-aligned transposed format, if your input does not match the requirement,
- this function will do a transposing with a set of slow PyTorch operations.
+ this function will do a transposing with a set of slow Pypaddle operations.
On the M axis, inputs are grouped into several batches, of which batch sizes aligned to
`get_m_alignment_for_contiguous_layout()` (128).
Arguments:
- lhs: the first element is an FP8 tensor (typed `torch.float8_e4m3fn`) of shape `[m_sum, k]`,
+ lhs: the first element is an FP8 tensor (typed `paddle.float8_e4m3fn`) of shape `[m_sum, k]`,
the second element is an FP32 1x128 scaling tensor for LHS of shape `[m_sum, ⌈k / 128⌉]`.
- rhs: the first element is an FP8 tensor (typed `torch.float8_e4m3fn`) of shape `[num_groups, n, k]`.
+ rhs: the first element is an FP8 tensor (typed `paddle.float8_e4m3fn`) of shape `[num_groups, n, k]`.
the second element is an FP32 128x128 scaling tensor for RHS of shape `[num_groups, ⌈n / 128⌉, ⌈k / 128⌉]`.
out: the BF16 output tensor of shape `[m_sum, n]`, representing the result.
- m_indices: a tensor of shape `[m_sum]` with type `torch.int`.
+ m_indices: a tensor of shape `[m_sum]` with type `paddle.int`.
`m_indices[i]` records the group which the i-th row of the LHS belong to,
which means that the i-th row of the LHS matrix will be multiplied with `rhs[m_indices[i]]`.
Values of `m_indices` in every-m-alignment-block must also be the same.
@@ -68,19 +68,19 @@ def m_grouped_gemm_fp8_fp8_bf16_nt_contiguous(lhs: Tuple[torch.Tensor, torch.Ten
m__ = m_indices.numel()
# Type and shape checks
- assert m == m_ == m__ and k == k_ and n == n_
- assert lhs_scales.shape == (m, (k + 127) // 128)
- assert rhs_scales.shape == (num_groups, (n + 127) // 128, (k + 127) // 128)
- assert lhs.dtype == torch.float8_e4m3fn and lhs_scales.dtype == torch.float32
- assert rhs.dtype == torch.float8_e4m3fn and rhs_scales.dtype == torch.float32
- assert out.dtype == torch.bfloat16
- assert m_indices.dtype == torch.int32
- assert lhs.is_contiguous() and rhs.is_contiguous()
- assert out.is_contiguous() and m_indices.is_contiguous()
+ # assert m == m_ == m__ and k == k_ and n == n_
+ # assert lhs_scales.shape == (m, (k + 127) // 128)
+ # assert rhs_scales.shape == (num_groups, (n + 127) // 128, (k + 127) // 128)
+ # assert lhs.dtype == paddle.float8_e4m3fn and lhs_scales.dtype == paddle.float32
+ # assert rhs.dtype == paddle.float8_e4m3fn and rhs_scales.dtype == paddle.float32
+ # assert out.dtype == paddle.bfloat16
+ # assert m_indices.dtype == paddle.int32
+ # assert lhs.is_contiguous() and rhs.is_contiguous()
+ # assert out.is_contiguous() and m_indices.is_contiguous()
# LHS scales must be transposed for TMA load, but not for RHS scales
lhs_scales = get_col_major_tma_aligned_tensor(lhs_scales)
- assert rhs_scales.is_contiguous()
+ # assert rhs_scales.is_contiguous()
# Do nothing if `m` is zero
if m == 0:
@@ -92,7 +92,7 @@ def m_grouped_gemm_fp8_fp8_bf16_nt_contiguous(lhs: Tuple[torch.Tensor, torch.Ten
num_sms, block_m, block_n, num_stages, tma_multicast_config, smem_config = get_best_configs(m, n, k, 1, num_sms, is_grouped_contiguous=True)
args = (lhs, lhs_scales, rhs, rhs_scales, out,
m_indices, m, num_groups,
- torch.cuda.current_stream(), num_sms, smem_config[0])
+ paddle.device.cuda.current_stream(), num_sms, smem_config[0])
runtime = jit_tuner.compile_and_tune(
name='m_grouped_gemm_fp8_fp8_bf16_nt',
keys={'N': n, 'K': k, 'BLOCK_M': block_m, 'BLOCK_N': block_n,
@@ -105,11 +105,11 @@ def m_grouped_gemm_fp8_fp8_bf16_nt_contiguous(lhs: Tuple[torch.Tensor, torch.Ten
'GEMM_TYPE': 'GroupedContiguous'},
space=(),
includes=includes,
- arg_defs=(('lhs', torch.float8_e4m3fn), ('lhs_scales', torch.float),
- ('rhs', torch.float8_e4m3fn), ('rhs_scales', torch.float),
- ('out', torch.bfloat16),
- ('grouped_layout', torch.int32), ('m', int), ('num_groups', int),
- ('stream', torch.cuda.Stream), ('num_sms', int), ('smem_size', int)),
+ arg_defs=(('lhs', paddle.float8_e4m3fn), ('lhs_scales', paddle.float32),
+ ('rhs', paddle.float8_e4m3fn), ('rhs_scales', paddle.float32),
+ ('out', paddle.bfloat16),
+ ('grouped_layout', paddle.int32), ('m', int), ('num_groups', int),
+ ('stream', paddle.device.cuda.Stream), ('num_sms', int), ('smem_size', int)),
template=template,
args=args
)
@@ -118,22 +118,22 @@ def m_grouped_gemm_fp8_fp8_bf16_nt_contiguous(lhs: Tuple[torch.Tensor, torch.Ten
runtime(*args)
-def m_grouped_gemm_fp8_fp8_bf16_nt_masked(lhs: Tuple[torch.Tensor, torch.Tensor],
- rhs: Tuple[torch.Tensor, torch.Tensor],
- out: torch.Tensor, masked_m: torch.Tensor, expected_m: int) -> None:
+def m_grouped_gemm_fp8_fp8_bf16_nt_masked(lhs: Tuple[paddle.Tensor, paddle.Tensor],
+ rhs: Tuple[paddle.Tensor, paddle.Tensor],
+ out: paddle.Tensor, masked_m: paddle.Tensor, expected_m: int) -> None:
"""
Do a grouped GEMM (masked format) with FP8 inputs and BF16 output, with 1x128 LHS scaling and 128x128 RHS scaling.
LHS, RHS, RHS scaling factors, and output tensors must be in contiguous format.
RHS and RHS scaling factors are required to be transposed.
The LHS scaling tensor requires TMA-aligned transposed format, if your input does not match the requirement,
- this function will do a transposing with a set of slow PyTorch operations.
+ this function will do a transposing with a set of slow paddle operations.
Moreover, this alignment requirement is different with the contiguous-format kernel, as we require that each batch
should be separately transposed.
Arguments:
- lhs: the first element is an FP8 tensor (typed `torch.float8_e4m3fn`) of shape `[num_groups, m_max, k]`,
+ lhs: the first element is an FP8 tensor (typed `paddle.float8_e4m3fn`) of shape `[num_groups, m_max, k]`,
the second element is an FP32 1x128 scaling tensor for LHS of shape `[num_groups, m_max, ⌈k / 128⌉]`.
- rhs: the first element is an FP8 tensor (typed `torch.float8_e4m3fn`) of shape `[num_groups, n, k]`.
+ rhs: the first element is an FP8 tensor (typed `paddle.float8_e4m3fn`) of shape `[num_groups, n, k]`.
the second element is an FP32 128x128 scaling tensor for RHS of shape `[num_groups, ⌈n / 128⌉, ⌈k / 128⌉]`.
out: the BF16 output tensor of shape `[num_groups, m_max, n]`, representing the result.
masked_m: a tensor of shape `[num_groups]`, `masked_m[i]` records actual rows of the `lhs[i]` matrix to compute
@@ -149,21 +149,21 @@ def m_grouped_gemm_fp8_fp8_bf16_nt_masked(lhs: Tuple[torch.Tensor, torch.Tensor]
num_groups___ = masked_m.numel()
# Type and shape checks
- assert num_groups == num_groups_ == num_groups__ == num_groups___
- assert m == m_ and n == n_ and k == k_
- assert expected_m > 0 and m > 0 and n > 0 and k > 0 and num_groups > 0
- assert lhs_scales.shape == (num_groups, m, (k + 127) // 128)
- assert rhs_scales.shape == (num_groups, (n + 127) // 128, (k + 127) // 128)
- assert lhs.dtype == torch.float8_e4m3fn and lhs_scales.dtype == torch.float32
- assert rhs.dtype == torch.float8_e4m3fn and rhs_scales.dtype == torch.float32
- assert out.dtype == torch.bfloat16
- assert masked_m.dtype == torch.int32
- assert lhs.is_contiguous() and rhs.is_contiguous()
- assert out.is_contiguous() and masked_m.is_contiguous()
+ # assert num_groups == num_groups_ == num_groups__ == num_groups___
+ # assert m == m_ and n == n_ and k == k_
+ # assert expected_m > 0 and m > 0 and n > 0 and k > 0 and num_groups > 0
+ # assert lhs_scales.shape == (num_groups, m, (k + 127) // 128)
+ # assert rhs_scales.shape == (num_groups, (n + 127) // 128, (k + 127) // 128)
+ # assert lhs.dtype == paddle.float8_e4m3fn and lhs_scales.dtype == paddle.float32
+ # assert rhs.dtype == paddle.float8_e4m3fn and rhs_scales.dtype == paddle.float32
+ # assert out.dtype == paddle.bfloat16
+ # assert masked_m.dtype == paddle.int32
+ # assert lhs.is_contiguous() and rhs.is_contiguous()
+ # assert out.is_contiguous() and masked_m.is_contiguous()
# LHS scales must be transposed for TMA load, but not for RHS scales
lhs_scales = get_col_major_tma_aligned_tensor(lhs_scales)
- assert rhs_scales.is_contiguous()
+ # assert rhs_scales.is_contiguous()
# Auto-tuning with compilation
global includes, template
@@ -176,7 +176,7 @@ def m_grouped_gemm_fp8_fp8_bf16_nt_masked(lhs: Tuple[torch.Tensor, torch.Tensor]
args = (lhs, lhs_scales, rhs, rhs_scales, out,
masked_m, m,
- torch.cuda.current_stream(), num_sms, smem_config[0])
+ paddle.device.cuda.current_stream(), num_sms, smem_config[0])
runtime = jit_tuner.compile_and_tune(
name='m_grouped_gemm_fp8_fp8_bf16_nt',
keys={'N': n, 'K': k, 'BLOCK_M': block_m, 'BLOCK_N': block_n,
@@ -189,11 +189,11 @@ def m_grouped_gemm_fp8_fp8_bf16_nt_masked(lhs: Tuple[torch.Tensor, torch.Tensor]
'GEMM_TYPE': 'GroupedMasked'},
space=(),
includes=includes,
- arg_defs=(('lhs', torch.float8_e4m3fn), ('lhs_scales', torch.float),
- ('rhs', torch.float8_e4m3fn), ('rhs_scales', torch.float),
- ('out', torch.bfloat16),
- ('grouped_layout', torch.int32), ('m', int),
- ('stream', torch.cuda.Stream), ('num_sms', int), ('smem_size', int)),
+ arg_defs=(('lhs', paddle.float8_e4m3fn), ('lhs_scales', paddle.float32),
+ ('rhs', paddle.float8_e4m3fn), ('rhs_scales', paddle.float32),
+ ('out', paddle.bfloat16),
+ ('grouped_layout', paddle.int32), ('m', int),
+ ('stream', paddle.device.cuda.Stream), ('num_sms', int), ('smem_size', int)),
template=template,
args=args
)
diff --git a/deep_gemm/jit_kernels/tuner.py b/deep_gemm/jit_kernels/tuner.py
index 6ed6749..9e1d70f 100644
--- a/deep_gemm/jit_kernels/tuner.py
+++ b/deep_gemm/jit_kernels/tuner.py
@@ -1,6 +1,6 @@
import copy
import os
-import torch
+import paddle
from typing import Any, Dict
from ..jit import build, cpp_format, generate, Runtime
@@ -51,10 +51,10 @@ class JITTuner:
continue
# Measure performance with L2 flush and a large GEMM kernel before to reduce overhead between kernels
- start_event = torch.cuda.Event(enable_timing=True)
- end_event = torch.cuda.Event(enable_timing=True)
- torch.empty(int(256e6 // 4), dtype=torch.int, device='cuda').zero_()
- torch.randn((8192, 8192), dtype=torch.float, device='cuda') @ torch.randn((8192, 8192), dtype=torch.float, device='cuda')
+ start_event = paddle.device.cuda.Event(enable_timing=True)
+ end_event = paddle.device.cuda.Event(enable_timing=True)
+ paddle.empty((int(256e6 // 4)), dtype=paddle.int32).zero_()
+ paddle.randn((8192, 8192), dtype=paddle.float32) @ paddle.randn((8192, 8192), dtype=paddle.float32)
start_event.record()
for i in range(20):
assert runtime(*args) == 0
diff --git a/deep_gemm/jit_kernels/utils.py b/deep_gemm/jit_kernels/utils.py
index c6da56b..a17b1b1 100644
--- a/deep_gemm/jit_kernels/utils.py
+++ b/deep_gemm/jit_kernels/utils.py
@@ -1,4 +1,4 @@
-import torch
+import paddle
_num_sms = None
@@ -11,7 +11,7 @@ def set_num_sms(num_sms: int) -> None:
num_sms: the desired maximum SM count for all GEMM kernels to use.
"""
global _num_sms
- assert 0 < num_sms <= torch.cuda.get_device_properties(device='cuda').multi_processor_count
+ assert 0 < num_sms <= paddle.device.cuda.get_device_properties().multi_processor_count
_num_sms = num_sms
@@ -25,7 +25,7 @@ def get_num_sms() -> int:
"""
global _num_sms
if _num_sms is None:
- _num_sms = torch.cuda.get_device_properties(device='cuda').multi_processor_count
+ _num_sms = paddle.device.cuda.get_device_properties().multi_processor_count
return _num_sms
@@ -74,9 +74,9 @@ def get_tma_aligned_size(x: int, element_size: int) -> int:
return ceil_div(x, alignment) * alignment
-def get_col_major_tma_aligned_tensor(x: torch.Tensor) -> torch.Tensor:
+def get_col_major_tma_aligned_tensor(x: paddle.Tensor) -> paddle.Tensor:
"""
- Returns TMA-aligned transposed format of the input tensor. `torch.transpose` will be called if necessary.
+ Returns TMA-aligned transposed format of the input tensor. `paddle.transpose` will be called if necessary.
If the input tensor is already column-major layout and 16-byte aligned along the M axis
(thus meets the requirement of LHS scaling tensor in DeepGEMM), this function will do nothing.
@@ -92,18 +92,20 @@ def get_col_major_tma_aligned_tensor(x: torch.Tensor) -> torch.Tensor:
m, n = x.shape[-2], x.shape[-1]
aligned_m = get_tma_aligned_size(m, x.element_size())
if x.dim() == 2:
- if x.stride(0) == 1 and x.stride(1) == aligned_m:
+ if x.strides[0] == 1 and x.strides[1] == aligned_m:
return x
x, remove_dim = x.unsqueeze(0), True
b = x.shape[0]
# The last kernel gives a column-major TMA aligned layout
- if x.stride(0) == aligned_m * n and x.stride(1) == 1 and x.stride(2) == aligned_m:
+ if x.strides[0] == aligned_m * n and x.strides[1] == 1 and x.strides[2] == aligned_m:
return x.squeeze(0) if remove_dim else x
# Normal layout requires transposing
- aligned_x = torch.transpose(torch.empty((b, n, aligned_m), device=x.device, dtype=x.dtype), 1, 2)
+ aligned_x = paddle.transpose(
+ paddle.empty((b, n, aligned_m), dtype=x.dtype), perm=[0, 2, 1]
+ )
aligned_x[:, :m, :] = x
aligned_x = aligned_x[:, :m, :]
return aligned_x.squeeze(0) if remove_dim else aligned_x
diff --git a/deep_gemm/paddle_utils.py b/deep_gemm/paddle_utils.py
new file mode 100644
index 0000000..2326807
--- /dev/null
+++ b/deep_gemm/paddle_utils.py
@@ -0,0 +1,20 @@
+import os
+
+def get_cuda_home():
+ """Get Cuda home directory"""
+ cuda_home = os.environ.get("CUDA_HOME") or os.environ.get("CUDA_PATH")
+ if cuda_home:
+ return cuda_home
+
+ try:
+ which_cmd = "which nvcc"
+
+ nvcc_path = os.popen(which_cmd).read().strip()
+ if nvcc_path:
+ return os.path.dirname(os.path.dirname(nvcc_path))
+ except Exception:
+ pass
+
+ return None
+
+CUDA_HOME = get_cuda_home()
\ No newline at end of file
diff --git a/deep_gemm/utils.py b/deep_gemm/utils.py
index d5cdd01..5237f09 100644
--- a/deep_gemm/utils.py
+++ b/deep_gemm/utils.py
@@ -1,15 +1,15 @@
import os
import sys
import time
-import torch
-import torch.distributed as dist
+import paddle
+import paddle.distributed as dist
def bench(fn, num_warmups: int = 5, num_tests: int = 10,
high_precision: bool = False):
# Flush L2 cache with 256 MB data
- torch.cuda.synchronize()
- cache = torch.empty(int(256e6 // 4), dtype=torch.int, device='cuda')
+ paddle.device.cuda.synchronize()
+ cache = paddle.empty((int(256e6 // 4)), dtype=paddle.int32)
cache.zero_()
# Warmup
@@ -18,18 +18,18 @@ def bench(fn, num_warmups: int = 5, num_tests: int = 10,
# Add a large kernel to eliminate the CPU launch overhead
if high_precision:
- x = torch.randn((8192, 8192), dtype=torch.float, device='cuda')
- y = torch.randn((8192, 8192), dtype=torch.float, device='cuda')
+ x = paddle.randn((8192, 8192), dtype=paddle.float32)
+ y = paddle.randn((8192, 8192), dtype=paddle.float32)
x @ y
# Testing
- start_event = torch.cuda.Event(enable_timing=True)
- end_event = torch.cuda.Event(enable_timing=True)
+ start_event = paddle.device.cuda.Event(enable_timing=True)
+ end_event = paddle.device.cuda.Event(enable_timing=True)
start_event.record()
for i in range(num_tests):
fn()
end_event.record()
- torch.cuda.synchronize()
+ paddle.device.synchronize()
return start_event.elapsed_time(end_event) / num_tests
@@ -106,21 +106,21 @@ def bench_kineto(fn, kernel_names, num_tests: int = 30, suppress_kineto_output:
# Profile
suppress = suppress_stdout_stderr if suppress_kineto_output and not using_nsys else empty_suppress
with suppress():
- schedule = torch.profiler.schedule(wait=0, warmup=1, active=1, repeat=1) if not using_nsys else None
- profiler = torch.profiler.profile(activities=[torch.profiler.ProfilerActivity.CUDA], schedule=schedule) if not using_nsys else empty_suppress()
+ scheduler = paddle.profiler.make_scheduler(closed=0, ready=1, record=1, repeat=1) if not using_nsys else None
+ profiler = paddle.profiler.Profiler(targets=[paddle.profiler.ProfilerTarget.CPU, paddle.profiler.ProfilerTarget.GPU], scheduler=scheduler) if not using_nsys else empty_suppress()
with profiler:
for i in range(2):
# NOTES: use a large kernel and a barrier to eliminate the unbalanced CPU launch overhead
if barrier_comm_profiling:
- lhs = torch.randn((8192, 8192), dtype=torch.float, device='cuda')
- rhs = torch.randn((8192, 8192), dtype=torch.float, device='cuda')
+ lhs = paddle.randn((8192, 8192), dtype=paddle.float32)
+ rhs = paddle.randn((8192, 8192), dtype=paddle.float32)
lhs @ rhs
- dist.all_reduce(torch.ones(1, dtype=torch.float, device='cuda'))
+ dist.all_reduce(paddle.ones(1, dtype=paddle.float32))
for _ in range(num_tests):
if sleep_between_tests > 0.0:
time.sleep(sleep_between_tests)
if flush_l2:
- torch.empty(flush_l2_size, dtype=torch.int, device='cuda').zero_()
+ paddle.empty(flush_l2_size, dtype=paddle.int32).zero_()
fn()
if not using_nsys:
--
2.43.0
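
The patch above is essentially a mechanical torch→paddle port of DeepGEMM's JIT layer: dtypes (`torch.float` → `paddle.float32`), streams and events (`torch.cuda.*` → `paddle.device.cuda.*`), strides (`x.stride(i)` → `x.strides[i]`), and a small `paddle_utils.CUDA_HOME` shim replacing `torch.utils.cpp_extension.CUDA_HOME`. A minimal sketch exercising only the paddle calls that appear in the hunks (assumes a CUDA build of paddle):

```python
import paddle

x = paddle.randn((128, 96), dtype=paddle.float32)
y = paddle.randn((96, 128), dtype=paddle.float32)
print(x.strides)                                      # list of strides, replaces x.stride(i)

start = paddle.device.cuda.Event(enable_timing=True)  # replaces torch.cuda.Event
end = paddle.device.cuda.Event(enable_timing=True)
start.record()
_ = x @ y
end.record()
paddle.device.synchronize()                           # replaces torch.cuda.synchronize()
print(start.elapsed_time(end), "ms")
```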


@@ -1,188 +0,0 @@
// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "dtype.h"
#include "matmul_helper.h"
#include "my_types.h"
#include "paddle/extension.h"
#include "paddle/phi/core/kernel_registry.h"
template <typename T>
void AvxCompute(const paddle::Tensor &x,
const paddle::Tensor &weight,
const paddle::Tensor &w_bias,
bool trans,
const std::string alog,
paddle::Tensor &out,
xft::Matrix<T> &quantizedWeight,
xft::Vector<float> &WeightScale,
xft::Vector<float> &WeightZero,
xft::Vector<float> &WeightSum,
MMHelper *mmHelper) {
auto out_data = out.data<float>();
const float *x_data = reinterpret_cast<const float *>(x.data<float>());
const float *bias_data = nullptr;
if (w_bias.initialized()) {
bias_data = reinterpret_cast<const float *>(w_bias.data<float>());
}
int m = 1;
for (int i = 0; i < x.shape().size() - 1; i++) {
m = m * x.shape()[i];
}
int k = x.shape()[x.shape().size() - 1];
int l = weight.shape()[1];
int n = weight.shape()[1];
if (w_bias.initialized()) {
mmHelper->compute_bias(false,
m,
n,
k,
1.0f,
x_data,
k,
quantizedWeight.Data(),
WeightScale.Data(),
WeightZero.Data(),
WeightSum.Data(),
0.0f,
out_data,
l,
bias_data);
} else {
mmHelper->compute(false,
m,
n,
k,
1.0f,
x_data,
k,
quantizedWeight.Data(),
WeightScale.Data(),
WeightZero.Data(),
WeightSum.Data(),
0.0,
out_data,
l);
}
};
template <typename T>
void AvxWeightOnly(const paddle::Tensor &x,
const paddle::Tensor &weight,
const paddle::Tensor &w_bias,
bool trans,
const std::string alog,
paddle::Tensor &out) {
static std::unordered_map<std::string,
std::tuple<xft::Matrix<T> *,
xft::Vector<float> *,
xft::Vector<float> *,
xft::Vector<float> *>>
weight_only_hub;
std::stringstream weights_addr;
weights_addr << weight.data<float>() << alog;
std::string weight_only_key = weights_addr.str();
auto it_created = weight_only_hub.find(weight_only_key);
static MMHelper *mmHelper;
int rows = weight.shape()[0], cols = weight.shape()[1];
xft::Vector<float> *WeightScale =
new xft::Vector<float>(); // if weight is int8
xft::Vector<float> *WeightZero =
new xft::Vector<float>(); // if weight is int8
xft::Vector<float> *WeightSum =
new xft::Vector<float>(); // if weight is int8
xft::Matrix<T> *quantizedWeight = new xft::Matrix<T>();
if (it_created == weight_only_hub.end()) {
auto weight_ptr = reinterpret_cast<const float *>(weight.data<float>());
xft::Matrix<T> convertedWeight;
mmHelper = new MMHelper(xft::DeviceKind::iCPU, 0);
mmHelper->convertWeight(trans,
rows,
cols,
weight_ptr,
nullptr,
nullptr,
convertedWeight,
*WeightScale,
*WeightZero,
*WeightSum);
quantizedWeight->Resize(rows, cols);
mmHelper->packWeight(trans, convertedWeight, *quantizedWeight);
weight_only_hub[weight_only_key] = std::make_tuple(
quantizedWeight, WeightScale, WeightZero, WeightSum);
AvxCompute<T>(x,
weight,
w_bias,
trans,
alog,
out,
*quantizedWeight,
*WeightScale,
*WeightZero,
*WeightSum,
mmHelper);
} else {
AvxCompute<T>(x,
weight,
w_bias,
trans,
alog,
out,
*(std::get<0>(it_created->second)),
*(std::get<1>(it_created->second)),
*(std::get<2>(it_created->second)),
*(std::get<3>(it_created->second)),
mmHelper);
}
}
std::vector<paddle::Tensor> InvokeAvxWeightOnly(const paddle::Tensor &x,
const paddle::Tensor &weight,
const paddle::Tensor &w_bias,
const std::string &alog,
bool trans) {
auto out_shape = x.shape();
out_shape[out_shape.size() - 1] = weight.shape()[1];
auto out = paddle::empty(out_shape, x.dtype(), paddle::CPUPlace());
if (alog == "int8") {
AvxWeightOnly<int8_t>(x, weight, w_bias, trans, alog, out);
} else if (alog == "fp16") {
AvxWeightOnly<float16_t>(x, weight, w_bias, trans, alog, out);
} else {
AvxWeightOnly<float16_t>(x, weight, w_bias, trans, alog, out);
}
return {out};
}
std::vector<std::vector<int64_t>> AvxWeightOnlyInferShape(
std::vector<int64_t> x_shape,
std::vector<int64_t> weigh_shape,
std::vector<int64_t> weigh_bias_shape) {
int m = 1;
for (int i = 0; i < x_shape.size() - 1; i++) {
m = m * x_shape[i];
}
return {std::vector<int64_t>{m, weigh_shape[1]}};
}
std::vector<paddle::DataType> AvxWeightOnlyInferDtype(
paddle::DataType x_dtype,
paddle::DataType weight_dtype,
paddle::DataType weight_bias_dtype) {
return {x_dtype};
}
PD_BUILD_STATIC_OP(avx_weight_only)
.Inputs({"x", "weight", "w_bias"})
.Outputs({"out"})
.Attrs({"alog: std::string", "trans:bool"})
.SetKernelFn(PD_KERNEL(InvokeAvxWeightOnly))
.SetInferShapeFn(PD_INFER_SHAPE(AvxWeightOnlyInferShape))
.SetInferDtypeFn(PD_INFER_DTYPE(AvxWeightOnlyInferDtype));


@@ -0,0 +1,268 @@
// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <vector>
#include "paddle/extension.h"
#ifndef PD_BUILD_STATIC_OP
#define PD_BUILD_STATIC_OP(name) PD_BUILD_OP(static_op_##name)
#endif
template <typename T>
void RebuildPaddingCPUImpl(T *output_data,
const T *input_data,
const int *cum_offsets_data,
const int *seq_len_this_time_data,
const int *seq_lens_decoder_data,
const int *seq_lens_encoder_data,
int max_input_length,
int dim_embed,
const int elem_nums) {
for (int i = 0; i < elem_nums; ++i) {
const int bi = i / dim_embed;
const int bias_idx = i % dim_embed;
int seq_id = 0;
if (seq_len_this_time_data[bi] == 0) {
continue;
}
if (seq_lens_decoder_data[bi] == 0 && seq_lens_encoder_data[bi] == 0) {
continue;
}
if (seq_lens_encoder_data[bi] > 0) {
seq_id = seq_lens_encoder_data[bi] - 1;
}
const int ori_token_idx =
bi * max_input_length - cum_offsets_data[bi] + seq_id;
const int src_offset = ori_token_idx * dim_embed + bias_idx;
output_data[i] = input_data[src_offset];
}
}
template <typename T>
void RebuildAppendPaddingCPUImpl(T *output_data,
const T *input_data,
const int *cum_offsets_data,
const int *seq_len_this_time_data,
const int *seq_lens_decoder_data,
const int *seq_lens_encoder_data,
const int *output_padding_offset_data,
const int max_input_length,
const int dim_embed,
const int64_t output_elem_nums) {
for (int i = 0; i < output_elem_nums; ++i) {
int out_token_id = i / dim_embed;
int ori_token_id =
out_token_id + output_padding_offset_data[out_token_id];
int bi = ori_token_id / max_input_length;
if (seq_len_this_time_data[bi] == 0 ||
(seq_lens_decoder_data[bi] == 0 &&
seq_lens_encoder_data[bi] == 0)) {
continue;
}
int seq_id = 0;
if (seq_lens_encoder_data[bi] > 0) {
seq_id = seq_lens_encoder_data[bi] - 1;
}
int input_token_id = ori_token_id - cum_offsets_data[bi] + seq_id;
int bias_idx = i % dim_embed;
int src_offset = input_token_id * dim_embed + bias_idx;
output_data[i] = input_data[src_offset];
}
}
std::vector<paddle::Tensor> RebuildPaddingCPU(
const paddle::Tensor &tmp_out,
const paddle::Tensor &cum_offsets,
const paddle::Tensor &seq_len_this_time,
const paddle::Tensor &seq_lens_decoder,
const paddle::Tensor &seq_lens_encoder,
const paddle::optional<paddle::Tensor> &output_padding_offset,
int max_input_length) {
auto tmp_out_cpu = tmp_out.copy_to(paddle::CPUPlace(), true);
auto cum_offsets_cpu = cum_offsets.copy_to(paddle::CPUPlace(), true);
auto seq_len_this_time_cpu =
seq_len_this_time.copy_to(paddle::CPUPlace(), true);
auto seq_lens_decoder_cpu =
seq_lens_decoder.copy_to(paddle::CPUPlace(), true);
auto seq_lens_encoder_cpu =
seq_lens_encoder.copy_to(paddle::CPUPlace(), true);
paddle::optional<paddle::Tensor> output_padding_offset_cpu;
if (output_padding_offset) {
output_padding_offset_cpu =
output_padding_offset->copy_to(paddle::CPUPlace(), true);
}
int token_num = tmp_out_cpu.shape()[0];
int dim_embed = tmp_out_cpu.shape()[1];
int bsz = cum_offsets_cpu.shape()[0];
paddle::Tensor out;
if (output_padding_offset_cpu) {
int need_delete_token_num = 0;
for (int i = 0; i < bsz; ++i) {
if (seq_lens_encoder_cpu.data<int>()[i] > 0) {
need_delete_token_num +=
seq_lens_encoder_cpu.data<int>()[i] - 1;
}
}
int output_token_num = token_num - need_delete_token_num;
out = paddle::full({output_token_num, dim_embed},
0,
tmp_out_cpu.dtype(),
paddle::CPUPlace());
} else {
out = paddle::full(
{bsz, dim_embed}, 0, tmp_out_cpu.dtype(), paddle::CPUPlace());
}
const int *cum_offsets_data = cum_offsets_cpu.data<int>();
const int *seq_len_this_time_data = seq_len_this_time_cpu.data<int>();
const int *seq_lens_decoder_data = seq_lens_decoder_cpu.data<int>();
const int *seq_lens_encoder_data = seq_lens_encoder_cpu.data<int>();
int elem_nums = out.numel();
if (output_padding_offset_cpu) {
const int *output_padding_offset_data =
output_padding_offset_cpu->data<int>();
switch (tmp_out_cpu.dtype()) {
case paddle::DataType::FLOAT32:
RebuildAppendPaddingCPUImpl<float>(out.data<float>(),
tmp_out_cpu.data<float>(),
cum_offsets_data,
seq_len_this_time_data,
seq_lens_decoder_data,
seq_lens_encoder_data,
output_padding_offset_data,
max_input_length,
dim_embed,
elem_nums);
break;
case paddle::DataType::FLOAT16:
RebuildAppendPaddingCPUImpl<paddle::float16>(
out.data<paddle::float16>(),
tmp_out_cpu.data<paddle::float16>(),
cum_offsets_data,
seq_len_this_time_data,
seq_lens_decoder_data,
seq_lens_encoder_data,
output_padding_offset_data,
max_input_length,
dim_embed,
elem_nums);
break;
case paddle::DataType::BFLOAT16:
RebuildAppendPaddingCPUImpl<paddle::bfloat16>(
out.data<paddle::bfloat16>(),
tmp_out_cpu.data<paddle::bfloat16>(),
cum_offsets_data,
seq_len_this_time_data,
seq_lens_decoder_data,
seq_lens_encoder_data,
output_padding_offset_data,
max_input_length,
dim_embed,
elem_nums);
break;
default:
PD_THROW(
"Unsupported data type for rebuild_padding_cpu. "
"Only float32, float16, and bfloat16 are supported.");
}
} else {
switch (tmp_out_cpu.dtype()) {
case paddle::DataType::FLOAT32:
RebuildPaddingCPUImpl<float>(out.data<float>(),
tmp_out_cpu.data<float>(),
cum_offsets_data,
seq_len_this_time_data,
seq_lens_decoder_data,
seq_lens_encoder_data,
max_input_length,
dim_embed,
elem_nums);
break;
case paddle::DataType::FLOAT16:
RebuildPaddingCPUImpl<paddle::float16>(
out.data<paddle::float16>(),
tmp_out_cpu.data<paddle::float16>(),
cum_offsets_data,
seq_len_this_time_data,
seq_lens_decoder_data,
seq_lens_encoder_data,
max_input_length,
dim_embed,
elem_nums);
break;
case paddle::DataType::BFLOAT16:
RebuildPaddingCPUImpl<paddle::bfloat16>(
out.data<paddle::bfloat16>(),
tmp_out_cpu.data<paddle::bfloat16>(),
cum_offsets_data,
seq_len_this_time_data,
seq_lens_decoder_data,
seq_lens_encoder_data,
max_input_length,
dim_embed,
elem_nums);
break;
default:
PD_THROW(
"Unsupported data type for rebuild_padding_cpu. "
"Only float32, float16, and bfloat16 are supported.");
}
}
return {out};
}
std::vector<std::vector<int64_t>> RebuildPaddingInferShape(
const std::vector<int64_t> &tmp_out_shape,
const std::vector<int64_t> &cum_offsets_shape,
const std::vector<int64_t> &seq_len_this_time_shape,
const std::vector<int64_t> &seq_lens_decoder_shape,
const std::vector<int64_t> &seq_lens_encoder_shape,
const paddle::optional<std::vector<int64_t>> &output_padding_offset_shape) {
int64_t dim_embed = tmp_out_shape[1];
if (output_padding_offset_shape) {
return {{-1, dim_embed}};
} else {
int64_t bsz = cum_offsets_shape[0];
return {{bsz, dim_embed}};
}
}
std::vector<paddle::DataType> RebuildPaddingInferDtype(
const paddle::DataType &tmp_out_dtype,
const paddle::DataType &cum_offsets_dtype,
const paddle::DataType &seq_len_this_time_dtype,
const paddle::DataType &seq_lens_decoder_dtype,
const paddle::DataType &seq_lens_encoder_dtype,
const paddle::optional<paddle::DataType> &output_padding_offset_dtype) {
return {tmp_out_dtype};
}
PD_BUILD_STATIC_OP(rebuild_padding_cpu)
.Inputs({"tmp_out",
"cum_offsets",
"seq_len_this_time",
"seq_lens_decoder",
"seq_lens_encoder",
paddle::Optional("output_padding_offset")})
.Outputs({"out"})
.Attrs({"max_input_length: int"})
.SetKernelFn(PD_KERNEL(RebuildPaddingCPU))
.SetInferShapeFn(PD_INFER_SHAPE(RebuildPaddingInferShape))
.SetInferDtypeFn(PD_INFER_DTYPE(RebuildPaddingInferDtype));


@@ -1,201 +0,0 @@
// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "layers_decoder.h"
#include "paddle/extension.h"
#include "paddle/phi/core/kernel_registry.h"
std::vector<paddle::Tensor> InvokeAllLLaMALayer(
const paddle::Tensor &input,
const std::vector<paddle::Tensor> &ln1Gamma,
const std::vector<paddle::Tensor> &ln1Beta,
const std::vector<paddle::Tensor> &qkvWeight,
const std::vector<paddle::Tensor> &qkvBiasWeight,
const std::vector<paddle::Tensor> &attnOutWeight,
const std::vector<paddle::Tensor> &attnOutBias,
const std::vector<paddle::Tensor> &ln2Gamma,
const std::vector<paddle::Tensor> &ln2Beta,
const std::vector<paddle::Tensor> &gateWeight,
const std::vector<paddle::Tensor> &gateBias,
const std::vector<paddle::Tensor> &upWeight,
const std::vector<paddle::Tensor> &upBias,
const std::vector<paddle::Tensor> &downWeight,
const std::vector<paddle::Tensor> &downBias,
const paddle::Tensor &pastSeqLen,
const paddle::Tensor &currentSeqLen,
const paddle::Tensor &step,
int hiddensize,
int totalLayer,
const std::string &computeType,
const std::string &activation,
const std::string &normType,
int attHeadDim,
int attHeadNum,
int kvHeadNum,
int maxPositions,
int maxPosEmbed,
int intermediateSize) {
auto out = paddle::empty_like(input);
auto batchSize = input.shape()[0];
auto inputSeqLen = input.shape()[1];
auto past_seq_len = pastSeqLen.data<int64_t>()[0];
auto cur_seq_len = static_cast<int64_t>(currentSeqLen.data<int32_t>()[0]);
auto step_id = step.data<int64_t>()[0];
auto output_ptr = reinterpret_cast<void *>(out.data<float>());
auto xft_data_type = xft::DataType::fp16;
if (computeType == "bf16") {
xft_data_type = xft::DataType::bf16;
} else if (computeType == "bf16_int8") {
xft_data_type = xft::DataType::bf16_int8;
}
auto xft_act_type = xft::ActivationType::SILU;
if (activation == "relu") {
xft_act_type = xft::ActivationType::RELU;
} else if (activation == "gelu") {
xft_act_type = xft::ActivationType::GELU;
} else if (activation == "swiglu") {
xft_act_type = xft::ActivationType::SWIGLU;
}
auto xft_norm_type = xft::NormType::RMS;
if (normType == "layernorm") {
xft_norm_type = xft::NormType::LN;
}
auto input_ptr = reinterpret_cast<const void *>(input.data<float>());
for (int i = 0; i < totalLayer; ++i) {
auto ln1Gamma_ptr =
reinterpret_cast<const float *>(ln1Gamma[i].data<float>());
auto ln1Beta_ptr =
reinterpret_cast<const float *>(ln1Beta[i].data<float>());
auto qkvWeight_ptr =
reinterpret_cast<const void *>(qkvWeight[i].data<float>());
auto qkvBiasWeight_ptr =
reinterpret_cast<const float *>(qkvBiasWeight[i].data<float>());
auto attnOutWeight_ptr =
reinterpret_cast<const void *>(attnOutWeight[i].data<float>());
auto ln2Gamma_ptr =
reinterpret_cast<const float *>(ln2Gamma[i].data<float>());
auto ln2Beta_ptr =
reinterpret_cast<const float *>(ln2Beta[i].data<float>());
auto gate_weight_ptr =
reinterpret_cast<const void *>(gateWeight[i].data<float>());
auto up_weight_ptr =
reinterpret_cast<const void *>(upWeight[i].data<float>());
auto down_weight_ptr =
reinterpret_cast<const void *>(downWeight[i].data<float>());
auto gate_bias_ptr =
reinterpret_cast<const float *>(gateBias[i].data<float>());
auto up_bias_ptr =
reinterpret_cast<const float *>(upBias[i].data<float>());
auto down_bias_ptr =
reinterpret_cast<const float *>(downBias[i].data<float>());
auto attnOutBias_ptr =
reinterpret_cast<const float *>(attnOutBias[i].data<float>());
invokeLayerLLaMA(
xft_data_type, // dt
xft_act_type, // at
xft_norm_type, // nt
i, // layerId
totalLayer, // totalLayers
batchSize, // batchSize
inputSeqLen, // inputSeqLen
attHeadDim, // attHeadDim
attHeadNum, // attHeadNum
kvHeadNum, // kvHeadNum
maxPositions, // maxPositions
maxPosEmbed, // maxPosEmbed
past_seq_len, // pastSeqLen
cur_seq_len, // currentSeqLen
step_id, // step
hiddensize, // hiddenSize
intermediateSize, // intermediateSize
reinterpret_cast<void *>(output_ptr), // output
hiddensize, // outputStride
input_ptr, // input
hiddensize, // inputStride
ln1Gamma_ptr, // ln1Gamma
ln1Beta_ptr, // ln1Beta
qkvWeight_ptr, // queryWeight
qkvWeight_ptr + hiddensize, // keyWeight
qkvWeight_ptr + hiddensize + kvHeadNum * attHeadDim, // valueWeight
attnOutWeight_ptr, // attnOutWeight
ln2Gamma_ptr, // ln2Gamma
ln2Beta_ptr, // ln2Beta
gate_weight_ptr,
up_weight_ptr,
down_weight_ptr,
qkvBiasWeight_ptr, // queryBias
qkvBiasWeight_ptr + hiddensize, // keyBias
qkvBiasWeight_ptr + hiddensize +
kvHeadNum * attHeadDim, // valueBias
attnOutBias_ptr, // attnOutBias
qkvWeight_ptr, // myqkvWeight
gate_bias_ptr,
up_bias_ptr,
down_bias_ptr,
qkvBiasWeight_ptr);
if (i < totalLayer - 1) {
memcpy(const_cast<void *>(input_ptr),
output_ptr,
batchSize * inputSeqLen * hiddensize * sizeof(float));
}
}
return {out};
}
std::vector<std::vector<int64_t>> AllLLaMALayerInferShape(
std::vector<int64_t> x_shape) {
return {x_shape};
}
std::vector<paddle::DataType> AllLLaMALayerInferDtype(
paddle::DataType x_dtype) {
return {x_dtype};
}
PD_BUILD_STATIC_OP(xft_llama_all_layer)
.Inputs({
"x",
paddle::Vec("ln1Gamma"),
paddle::Vec("ln1Beta"),
paddle::Vec("qkvWeight"),
paddle::Vec("qkvBiasWeight"),
paddle::Vec("attnOutWeight"),
paddle::Vec("attnOutBias"),
paddle::Vec("ln2Gamma"),
paddle::Vec("ln2Beta"),
paddle::Vec("gateWeight"),
paddle::Vec("gateBias"),
paddle::Vec("upWeight"),
paddle::Vec("upBias"),
paddle::Vec("downWeight"),
paddle::Vec("downBias"),
"pastSeqLen",
"currentSeqLen",
"step",
})
.Outputs({"out"})
.Attrs({"hiddensize :int",
"totalLayer :int",
"computeType : std::string",
"activation :std::string",
"normType :std::string",
"attHeadDim: int",
"attHeadNum: int",
"kvHeadNum: int",
"maxPositions: int",
"maxPosEmbed: int",
"intermediateSize: int"})
.SetKernelFn(PD_KERNEL(InvokeAllLLaMALayer))
.SetInferShapeFn(PD_INFER_SHAPE(AllLLaMALayerInferShape))
.SetInferDtypeFn(PD_INFER_DTYPE(AllLLaMALayerInferDtype));


@@ -1,126 +0,0 @@
// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <omp.h>
#include <cstdio>
#include <iostream>
#include "paddle/extension.h"
void greedy_search(const float *probs,
int64_t *next_token_ids,
int bsz,
int vocab_size) {
int numThreads = 0;
#pragma omp parallel
{
int tid = omp_get_thread_num();
if (tid == 0) {
numThreads = omp_get_num_threads();
}
}
float maxVals[bsz];
// Small batch size (each sample can have at least 2 threads)
if (numThreads / bsz >= 2) {
int thrPerSample = numThreads / bsz;
int sizePerThr = (vocab_size + thrPerSample - 1) / thrPerSample;
int maxIndices[bsz * thrPerSample];
float maxValues[bsz * thrPerSample];
// TODO: if size is small, possible to cause out of boundary
#pragma omp parallel for collapse(2)
for (int b = 0; b < bsz; ++b) {
for (int t = 0; t < thrPerSample; ++t) {
int start = t * sizePerThr;
int end = (start + sizePerThr) > vocab_size
? vocab_size
: (start + sizePerThr);
const float *p = probs + b * vocab_size;
int maxIdx = start;
float maxVal = p[start];
for (int off = start + 1; off < end; ++off) {
if (p[off] > maxVal) {
maxVal = p[off];
maxIdx = off;
}
}
// False sharing happens, but since only one time, not avoided
maxIndices[b * thrPerSample + t] = maxIdx;
maxValues[b * thrPerSample + t] = maxVal;
}
}
// Local reduction
for (int i = 0; i < bsz; ++i) {
int *pIndices = maxIndices + i * thrPerSample;
float *pValues = maxValues + i * thrPerSample;
int maxIdx = pIndices[0];
float maxVal = pValues[0];
for (int j = 1; j < thrPerSample; ++j) {
if (pValues[j] > maxVal) {
maxVal = pValues[j];
maxIdx = pIndices[j];
}
}
next_token_ids[i] = maxIdx;
maxVals[i] = maxVal;
}
}
// Each thread handle one sample (one row)
else {
#pragma omp parallel for
for (int i = 0; i < bsz; ++i) {
int maxId = 0;
const float *p = probs + i * vocab_size;
float maxVal = p[0];
for (int j = 1; j < vocab_size; ++j) {
if (p[j] > maxVal) {
maxVal = p[j];
maxId = j;
}
}
next_token_ids[i] = maxId;
maxVals[i] = maxVal;
}
}
return;
}
std::vector<paddle::Tensor> XftGreedySearch(const paddle::Tensor &probs) {
const int bsz = probs.shape()[0];
const int vocab_size = probs.shape()[1];
auto next_tokens =
paddle::empty({bsz, 1}, paddle::DataType::INT64, probs.place());
greedy_search(probs.data<float>(),
const_cast<int64_t *>(next_tokens.data<int64_t>()),
bsz,
vocab_size);
return {next_tokens};
}
std::vector<std::vector<int64_t>> XftGreedySearchInferShape(
const std::vector<int64_t> &probs_shape) {
int64_t bsz = probs_shape[0];
return {{bsz, 1}};
}
std::vector<paddle::DataType> XftGreedySearchInferDtype(
const paddle::DataType &probs_dtype) {
return {paddle::DataType::INT64};
}
PD_BUILD_STATIC_OP(xft_greedy_search)
.Inputs({"probs"})
.Outputs({"next_tokens_ids"})
.SetInferShapeFn(PD_INFER_SHAPE(XftGreedySearchInferShape))
.SetInferDtypeFn(PD_INFER_DTYPE(XftGreedySearchInferDtype))
.SetKernelFn(PD_KERNEL(XftGreedySearch));
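The OpenMP kernel above is, at heart, a per-row argmax over the vocabulary: for each sample in the batch it picks the token id with the highest probability. A minimal single-threaded C++ reference of the same computation is sketched below (the helper name greedy_search_reference is illustrative); it is useful as a correctness check for the parallel version.
#include <cstdint>
// For each row of a [bsz, vocab_size] probability matrix, write the index of
// the maximum element into next_token_ids.
void greedy_search_reference(const float *probs, int64_t *next_token_ids,
                             int bsz, int vocab_size) {
    for (int b = 0; b < bsz; ++b) {
        const float *row = probs + static_cast<int64_t>(b) * vocab_size;
        int max_id = 0;
        for (int j = 1; j < vocab_size; ++j) {
            if (row[j] > row[max_id]) max_id = j;
        }
        next_token_ids[b] = max_id;
    }
}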

File diff suppressed because it is too large.


@@ -17,15 +17,12 @@
#include "paddle/phi/core/memory/memcpy.h"
template <int THREADBLOCK_SIZE>
__global__ void GetMaxLenKernel(const int *seq_lens,
const int *seq_lens_this_time,
const int *seq_lens_encoder,
const int *seq_lens_this_time_merged,
const int *seq_lens_encoder_merged,
const int *seq_mapping,
const int *system_lens,
int *max_lens,
const int batch_size) {
__global__ void
GetMaxLenKernel(const int *seq_lens, const int *seq_lens_this_time,
const int *seq_lens_encoder,
const int *seq_lens_this_time_merged,
const int *seq_lens_encoder_merged, const int *seq_mapping,
const int *system_lens, int *max_lens, const int batch_size) {
const int tid = threadIdx.x;
typedef cub::BlockReduce<int, THREADBLOCK_SIZE> BlockReduce;
@@ -41,43 +38,61 @@ __global__ void GetMaxLenKernel(const int *seq_lens,
int max_dec_len_without_system_this_thread = 0;
for (int i = tid; i < batch_size; i += blockDim.x) {
const int seq_len_this_time = seq_lens_this_time[i];
max_len_this_time_this_thread = max(seq_len_this_time,
max_len_this_time_this_thread);
max_len_encoder_this_thread = max(seq_lens_encoder[i],
max_len_encoder_this_thread);
max_len_this_time_this_thread =
max(seq_len_this_time, max_len_this_time_this_thread);
max_len_encoder_this_thread =
max(seq_lens_encoder[i], max_len_encoder_this_thread);
max_len_decoder_this_thread = max(seq_lens[i], max_len_decoder_this_thread);
if (seq_len_this_time <= 0) continue;
if (seq_len_this_time <= 0)
continue;
const int max_just_dec_len_now = seq_lens_encoder[i] > 0 ? 0 : seq_lens[i];
max_len_this_thread = max(seq_lens[i] + seq_len_this_time,
max_len_this_thread);
max_just_dec_len_this_thread = max(max_just_dec_len_this_thread,
max_just_dec_len_now);
max_len_this_thread =
max(seq_lens[i] + seq_len_this_time, max_len_this_thread);
max_just_dec_len_this_thread =
max(max_just_dec_len_this_thread, max_just_dec_len_now);
if (system_lens) {
const int real_bid = seq_mapping[i];
const int system_len_now = system_lens[real_bid];
max_system_len_this_thread = max(max_system_len_this_thread, system_len_now);
max_dec_len_without_system_this_thread = max(max_dec_len_without_system_this_thread,
max_just_dec_len_now - system_len_now);
max_system_len_this_thread =
max(max_system_len_this_thread, system_len_now);
max_dec_len_without_system_this_thread =
max(max_dec_len_without_system_this_thread,
max_just_dec_len_now - system_len_now);
}
}
if (system_lens) {
for (int i = tid; i < batch_size; i += blockDim.x) {
const int ori_seq_len_this_time = seq_lens_this_time_merged[i];
if (ori_seq_len_this_time <= 0) continue;
const int max_just_dec_merged_len_this_time_now = seq_lens_encoder_merged[i] > 0 ?
0 : ori_seq_len_this_time;
max_just_dec_merged_len_this_time_this_thread = max(max_just_dec_merged_len_this_time_this_thread,
max_just_dec_merged_len_this_time_now);
if (ori_seq_len_this_time <= 0)
continue;
const int max_just_dec_merged_len_this_time_now =
seq_lens_encoder_merged[i] > 0 ? 0 : ori_seq_len_this_time;
max_just_dec_merged_len_this_time_this_thread =
max(max_just_dec_merged_len_this_time_this_thread,
max_just_dec_merged_len_this_time_now);
}
}
int total_max_len_this_time = BlockReduce(temp_storage).Reduce(max_len_this_time_this_thread, MaxOp<int>());
int total_max_len_encoder = BlockReduce(temp_storage).Reduce(max_len_encoder_this_thread, MaxOp<int>());
int total_max_len_decoder = BlockReduce(temp_storage).Reduce(max_len_decoder_this_thread, MaxOp<int>());
int total = BlockReduce(temp_storage).Reduce(max_len_this_thread, MaxOp<int>());
int total_just_dec = BlockReduce(temp_storage).Reduce(max_just_dec_len_this_thread, MaxOp<int>());
int total_just_dec_merged = BlockReduce(temp_storage).Reduce(max_just_dec_merged_len_this_time_this_thread, MaxOp<int>());
int total_system_len = BlockReduce(temp_storage).Reduce(max_system_len_this_thread, MaxOp<int>());
int total_dec_len_without_system = BlockReduce(temp_storage).Reduce(max_dec_len_without_system_this_thread, MaxOp<int>());
int total_max_len_this_time =
BlockReduce(temp_storage)
.Reduce(max_len_this_time_this_thread, MaxOp<int>());
int total_max_len_encoder =
BlockReduce(temp_storage)
.Reduce(max_len_encoder_this_thread, MaxOp<int>());
int total_max_len_decoder =
BlockReduce(temp_storage)
.Reduce(max_len_decoder_this_thread, MaxOp<int>());
int total =
BlockReduce(temp_storage).Reduce(max_len_this_thread, MaxOp<int>());
int total_just_dec = BlockReduce(temp_storage)
.Reduce(max_just_dec_len_this_thread, MaxOp<int>());
int total_just_dec_merged =
BlockReduce(temp_storage)
.Reduce(max_just_dec_merged_len_this_time_this_thread, MaxOp<int>());
int total_system_len = BlockReduce(temp_storage)
.Reduce(max_system_len_this_thread, MaxOp<int>());
int total_dec_len_without_system =
BlockReduce(temp_storage)
.Reduce(max_dec_len_without_system_this_thread, MaxOp<int>());
if (tid == 0) {
max_lens[0] = total_max_len_this_time;
max_lens[1] = total_max_len_encoder;
@@ -90,30 +105,22 @@ __global__ void GetMaxLenKernel(const int *seq_lens,
}
}
void GetMaxLen(const paddle::Tensor& seq_lens_tensor,
const paddle::Tensor& seq_lens_this_time,
const paddle::Tensor& seq_lens_encoder,
paddle::Tensor &max_len_tensor,
const int batch_size) {
void GetMaxLen(const paddle::Tensor &seq_lens_tensor,
const paddle::Tensor &seq_lens_this_time,
const paddle::Tensor &seq_lens_encoder,
paddle::Tensor &max_len_tensor, const int batch_size) {
constexpr int blockSize = 1024;
GetMaxLenKernel<blockSize><<<1, blockSize, 0, seq_lens_encoder.stream()>>>(
seq_lens_tensor.data<int>(),
seq_lens_this_time.data<int>(),
seq_lens_encoder.data<int>(),
nullptr,
nullptr,
nullptr,
nullptr,
max_len_tensor.data<int>(),
batch_size);
seq_lens_tensor.data<int>(), seq_lens_this_time.data<int>(),
seq_lens_encoder.data<int>(), nullptr, nullptr, nullptr, nullptr,
max_len_tensor.data<int>(), batch_size);
}
__global__ void split_q_block(const int* __restrict__ seq_lens_q,
const int* __restrict__ seq_lens_encoder,
int* __restrict__ batch_ids,
int* __restrict__ tile_ids_per_batch,
int* __restrict__ num_blocks_x,
const int bsz,
__global__ void split_q_block(const int *__restrict__ seq_lens_q,
const int *__restrict__ seq_lens_encoder,
int *__restrict__ batch_ids,
int *__restrict__ tile_ids_per_batch,
int *__restrict__ num_blocks_x, const int bsz,
const int num_rows_per_block,
const int group_size) {
if (threadIdx.x == 0) {
@@ -124,8 +131,7 @@ __global__ void split_q_block(const int* __restrict__ seq_lens_q,
if (seq_lens_encoder && seq_lens_encoder[bid] > 0) {
seq_len = 0;
}
const int loop_times =
div_up(seq_len * group_size, num_rows_per_block);
const int loop_times = div_up(seq_len * group_size, num_rows_per_block);
for (uint32_t tile_id = 0; tile_id < loop_times; tile_id++) {
batch_ids[index] = bid;
tile_ids_per_batch[index++] = tile_id;
@@ -136,14 +142,12 @@ __global__ void split_q_block(const int* __restrict__ seq_lens_q,
}
}
__global__ void split_kv_block(const int* __restrict__ seq_lens_decoder,
const int* __restrict__ seq_lens_encoder,
int* __restrict__ batch_ids,
int* __restrict__ tile_ids_per_batch,
int* __restrict__ num_blocks_x,
const int bsz,
const int pad_len,
const int num_row_per_block) {
__global__ void split_kv_block(const int *__restrict__ seq_lens_decoder,
const int *__restrict__ seq_lens_encoder,
int *__restrict__ batch_ids,
int *__restrict__ tile_ids_per_batch,
int *__restrict__ num_blocks_x, const int bsz,
const int pad_len, const int num_row_per_block) {
if (threadIdx.x == 0) {
int gridx = 0;
int index = 0;
@@ -165,50 +169,46 @@ __global__ void split_kv_block(const int* __restrict__ seq_lens_decoder,
}
template <int THREADBLOCK_SIZE>
__global__ void get_max_len_kv_ernel(int* max_seq_lens_out,
const int* seq_lens_this_time,
const int* seq_lens_decoder,
const int batch_size) {
__global__ void
get_max_len_kv_ernel(int *max_seq_lens_out, const int *seq_lens_this_time,
const int *seq_lens_decoder, const int batch_size) {
const int tid = threadIdx.x;
typedef cub::BlockReduce<int, THREADBLOCK_SIZE> BlockReduce;
__shared__ typename BlockReduce::TempStorage temp_storage;
int max_len_this_thread = 0;
for (int i = tid; i < batch_size; i += blockDim.x) {
if (seq_lens_decoder[i] == 0) continue;
max_len_this_thread = max(seq_lens_this_time[i] + seq_lens_decoder[i], max_len_this_thread);
if (seq_lens_decoder[i] == 0)
continue;
max_len_this_thread =
max(seq_lens_this_time[i] + seq_lens_decoder[i], max_len_this_thread);
}
int total = BlockReduce(temp_storage).Reduce(max_len_this_thread, MaxOp<int>());
int total =
BlockReduce(temp_storage).Reduce(max_len_this_thread, MaxOp<int>());
if (tid == 0) {
*max_seq_lens_out = total;
}
}
std::vector<paddle::Tensor> GetBlockShapeAndSplitKVBlock(
const paddle::Tensor& seq_lens_encoder,
const paddle::Tensor& seq_lens_decoder,
const paddle::Tensor& seq_lens_this_time,
const paddle::Tensor& cum_offsets,
const int encoder_block_shape_q,
const int decoder_block_shape_q,
const int group_size,
const int block_size,
const paddle::Tensor &seq_lens_encoder,
const paddle::Tensor &seq_lens_decoder,
const paddle::Tensor &seq_lens_this_time, const paddle::Tensor &cum_offsets,
const int encoder_block_shape_q, const int decoder_block_shape_q,
const int group_size, const int block_size,
const int decoder_step_token_num) {
auto stream = seq_lens_encoder.stream();
int bsz = cum_offsets.shape()[0];
auto max_len_tensor =
GetEmptyTensor({8}, paddle::DataType::INT32, seq_lens_encoder.place());
GetMaxLen(
seq_lens_decoder,
seq_lens_this_time,
seq_lens_encoder,
max_len_tensor,
bsz);
GetMaxLen(seq_lens_decoder, seq_lens_this_time, seq_lens_encoder,
max_len_tensor, bsz);
// max_len_this_time, max_enc_len_this_time, max_dec_len_this_time, max_enc_dec_len_this_time,
// max_just_dec_len_this_time, max_just_dec_merged_len_this_time, max_system_len, max_just_dec_len_without_system
// max_len_this_time, max_enc_len_this_time, max_dec_len_this_time,
// max_enc_dec_len_this_time, max_just_dec_len_this_time,
// max_just_dec_merged_len_this_time, max_system_len,
// max_just_dec_len_without_system
auto max_len_cpu = max_len_tensor.copy_to(paddle::CPUPlace(), false);
auto max_len_cpu_ptr = max_len_cpu.data<int>();
int max_len_this_time = max_len_cpu_ptr[0];
@@ -229,67 +229,67 @@ std::vector<paddle::Tensor> GetBlockShapeAndSplitKVBlock(
paddle::Tensor decoder_batch_ids;
paddle::Tensor decoder_tile_ids_per_batch;
paddle::Tensor decoder_num_blocks_x_cpu; /*cpu*/
paddle::Tensor max_len_kv_cpu; /*cpu*/
paddle::Tensor max_len_kv_cpu; /*cpu*/
auto max_len_kv =
GetEmptyTensor({1}, paddle::DataType::INT32, seq_lens_decoder.place());
get_max_len_kv_ernel<128><<<1, 128, 0, stream>>>(
max_len_kv.data<int>(),
seq_lens_this_time.data<int>(),
seq_lens_decoder.data<int>(),
bsz
);
max_len_kv.data<int>(), seq_lens_this_time.data<int>(),
seq_lens_decoder.data<int>(), bsz);
max_len_kv_cpu =
max_len_kv.copy_to(paddle::CPUPlace(), false);
max_len_kv_cpu = max_len_kv.copy_to(paddle::CPUPlace(), false);
if (max_enc_len_this_time > 0) {
const uint32_t max_tile_size_per_bs_kv = div_up(max_enc_dec_len_this_time, block_size);
kv_batch_ids = GetEmptyTensor({bsz * max_tile_size_per_bs_kv},
paddle::DataType::INT32,
seq_lens_encoder.place());
kv_tile_ids_per_batch = GetEmptyTensor({bsz * max_tile_size_per_bs_kv},
paddle::DataType::INT32,
seq_lens_encoder.place());
const uint32_t max_tile_size_per_bs_kv =
div_up(max_enc_dec_len_this_time, block_size);
kv_batch_ids =
GetEmptyTensor({bsz * max_tile_size_per_bs_kv}, paddle::DataType::INT32,
seq_lens_encoder.place());
kv_tile_ids_per_batch =
GetEmptyTensor({bsz * max_tile_size_per_bs_kv}, paddle::DataType::INT32,
seq_lens_encoder.place());
auto kv_num_blocks_x =
GetEmptyTensor({1}, paddle::DataType::INT32, seq_lens_encoder.place());
split_kv_block<<<1, 32, 0, seq_lens_encoder.stream()>>>(
seq_lens_decoder.data<int>(),
// sequence_lengths->data<int>(),
seq_lens_encoder.data<int>(),
kv_batch_ids.data<int>(),
kv_tile_ids_per_batch.data<int>(),
kv_num_blocks_x.data<int>(),
bsz,
block_size,
block_size
);
seq_lens_decoder.data<int>(),
// sequence_lengths->data<int>(),
seq_lens_encoder.data<int>(), kv_batch_ids.data<int>(),
kv_tile_ids_per_batch.data<int>(), kv_num_blocks_x.data<int>(), bsz,
block_size, block_size);
kv_num_blocks_x_cpu = kv_num_blocks_x.copy_to(paddle::CPUPlace(), false);
const uint32_t encoder_max_tile_size_per_bs_q = div_up(
(max_enc_dec_len_this_time * group_size), encoder_block_shape_q);
const uint32_t encoder_max_tile_size_per_bs_q =
div_up((max_enc_dec_len_this_time * group_size), encoder_block_shape_q);
encoder_batch_ids =
GetEmptyTensor({bsz * encoder_max_tile_size_per_bs_q},
paddle::DataType::INT32,
seq_lens_encoder.place());
paddle::DataType::INT32, seq_lens_encoder.place());
encoder_tile_ids_per_batch =
GetEmptyTensor({bsz * encoder_max_tile_size_per_bs_q},
paddle::DataType::INT32,
seq_lens_encoder.place());
paddle::DataType::INT32, seq_lens_encoder.place());
auto encoder_num_blocks_x =
GetEmptyTensor({1}, paddle::DataType::INT32, seq_lens_encoder.place());
split_q_block<<<1, 32, 0, stream>>>(seq_lens_encoder.data<int>(),
nullptr,
split_q_block<<<1, 32, 0, stream>>>(seq_lens_encoder.data<int>(), nullptr,
encoder_batch_ids.data<int>(),
encoder_tile_ids_per_batch.data<int>(),
encoder_num_blocks_x.data<int>(),
bsz,
encoder_block_shape_q,
group_size);
encoder_num_blocks_x.data<int>(), bsz,
encoder_block_shape_q, group_size);
encoder_num_blocks_x_cpu =
encoder_num_blocks_x.copy_to(paddle::CPUPlace(), false);
} else {
encoder_batch_ids =
GetEmptyTensor({0}, paddle::DataType::INT32, seq_lens_encoder.place());
encoder_tile_ids_per_batch =
GetEmptyTensor({0}, paddle::DataType::INT32, seq_lens_encoder.place());
encoder_num_blocks_x_cpu =
GetEmptyTensor({0}, paddle::DataType::INT32, paddle::CPUPlace());
kv_batch_ids =
GetEmptyTensor({0}, paddle::DataType::INT32, seq_lens_encoder.place());
kv_tile_ids_per_batch =
GetEmptyTensor({0}, paddle::DataType::INT32, seq_lens_encoder.place());
kv_num_blocks_x_cpu =
GetEmptyTensor({0}, paddle::DataType::INT32, seq_lens_encoder.place());
}
if (max_just_dec_len_this_time > 0) {
const uint32_t decoder_max_tile_size_per_bs_q =
@@ -297,24 +297,26 @@ std::vector<paddle::Tensor> GetBlockShapeAndSplitKVBlock(
decoder_batch_ids =
GetEmptyTensor({bsz * decoder_max_tile_size_per_bs_q},
paddle::DataType::INT32,
seq_lens_encoder.place());
paddle::DataType::INT32, seq_lens_encoder.place());
decoder_tile_ids_per_batch =
GetEmptyTensor({bsz * decoder_max_tile_size_per_bs_q},
paddle::DataType::INT32,
seq_lens_encoder.place());
paddle::DataType::INT32, seq_lens_encoder.place());
auto decoder_num_blocks_x =
GetEmptyTensor({1}, paddle::DataType::INT32, seq_lens_encoder.place());
split_q_block<<<1, 32, 0, stream>>>(seq_lens_this_time.data<int>(),
seq_lens_encoder.data<int>(),
decoder_batch_ids.data<int>(),
decoder_tile_ids_per_batch.data<int>(),
decoder_num_blocks_x.data<int>(),
bsz,
decoder_block_shape_q,
group_size);
split_q_block<<<1, 32, 0, stream>>>(
seq_lens_this_time.data<int>(), seq_lens_encoder.data<int>(),
decoder_batch_ids.data<int>(), decoder_tile_ids_per_batch.data<int>(),
decoder_num_blocks_x.data<int>(), bsz, decoder_block_shape_q,
group_size);
decoder_num_blocks_x_cpu =
decoder_num_blocks_x.copy_to(paddle::CPUPlace(), false);
} else {
decoder_batch_ids =
GetEmptyTensor({0}, paddle::DataType::INT32, seq_lens_encoder.place());
decoder_tile_ids_per_batch =
GetEmptyTensor({0}, paddle::DataType::INT32, seq_lens_encoder.place());
decoder_num_blocks_x_cpu =
GetEmptyTensor({0}, paddle::DataType::INT32, paddle::CPUPlace());
}
return {encoder_batch_ids,
@@ -331,28 +333,22 @@ std::vector<paddle::Tensor> GetBlockShapeAndSplitKVBlock(
}
std::vector<paddle::DataType> GetBlockShapeAndSplitKVBlockInferDtype(
const paddle::DataType& seq_lens_encoder_dtype,
const paddle::DataType& seq_lens_decoder_dtype,
const paddle::DataType& seq_lens_this_time_dtype,
const paddle::DataType& cum_offsets_dtype) {
return {paddle::DataType::INT32,
paddle::DataType::INT32,
paddle::DataType::INT32,
paddle::DataType::INT32,
paddle::DataType::INT32,
paddle::DataType::INT32,
paddle::DataType::INT32,
paddle::DataType::INT32,
paddle::DataType::INT32,
paddle::DataType::INT32,
paddle::DataType::INT32};
const paddle::DataType &seq_lens_encoder_dtype,
const paddle::DataType &seq_lens_decoder_dtype,
const paddle::DataType &seq_lens_this_time_dtype,
const paddle::DataType &cum_offsets_dtype) {
return {
paddle::DataType::INT32, paddle::DataType::INT32, paddle::DataType::INT32,
paddle::DataType::INT32, paddle::DataType::INT32, paddle::DataType::INT32,
paddle::DataType::INT32, paddle::DataType::INT32, paddle::DataType::INT32,
paddle::DataType::INT32, paddle::DataType::INT32};
}
std::vector<std::vector<int64_t>> GetBlockShapeAndSplitKVBlockInferShape(
const std::vector<int64_t>& seq_lens_encoder_shape,
const std::vector<int64_t>& seq_lens_decoder_shape,
const std::vector<int64_t>& seq_lens_this_time_shape,
const std::vector<int64_t>& cum_offsets_shape) {
const std::vector<int64_t> &seq_lens_encoder_shape,
const std::vector<int64_t> &seq_lens_decoder_shape,
const std::vector<int64_t> &seq_lens_this_time_shape,
const std::vector<int64_t> &cum_offsets_shape) {
std::vector<int64_t> dynamic_shape = {-1};
return {dynamic_shape,
@@ -369,9 +365,7 @@ std::vector<std::vector<int64_t>> GetBlockShapeAndSplitKVBlockInferShape(
}
PD_BUILD_STATIC_OP(get_block_shape_and_split_kv_block)
.Inputs({"seq_lens_encoder",
"seq_lens_decoder",
"seq_lens_this_time",
.Inputs({"seq_lens_encoder", "seq_lens_decoder", "seq_lens_this_time",
"cum_offsets"})
.Outputs({paddle::Optional("encoder_batch_ids"),
paddle::Optional("encoder_tile_ids_per_batch"),
@@ -382,12 +376,9 @@ PD_BUILD_STATIC_OP(get_block_shape_and_split_kv_block)
paddle::Optional("decoder_batch_ids"),
paddle::Optional("decoder_tile_ids_per_batch"),
paddle::Optional("decoder_num_blocks"),
paddle::Optional("max_len_kv"),
"set_max_lengths"})
.Attrs({"encoder_block_shape_q: int",
"decoder_block_shape_q: int",
"group_size: int",
"block_size: int",
paddle::Optional("max_len_kv"), "set_max_lengths"})
.Attrs({"encoder_block_shape_q: int", "decoder_block_shape_q: int",
"group_size: int", "block_size: int",
"decoder_step_token_num: int"})
.SetKernelFn(PD_KERNEL(GetBlockShapeAndSplitKVBlock))
.SetInferShapeFn(PD_INFER_SHAPE(GetBlockShapeAndSplitKVBlockInferShape))
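The tiling logic above is driven by div_up: split_q_block emits div_up(seq_len * group_size, num_rows_per_block) (batch, tile) pairs per batch element, while the host sizes the batch_ids / tile_ids buffers as bsz times the per-batch worst case, an upper bound on that count. A small host-side sketch of the same arithmetic follows; the function names are illustrative, not FastDeploy APIs.
#include <vector>
// Ceiling division, matching the div_up used by the kernels above.
constexpr int div_up_host(int a, int b) { return (a + b - 1) / b; }
// Exact number of (batch, tile) pairs split_q_block would write for these
// sequence lengths; the device buffers above are allocated with
// bsz * max-per-batch, which bounds this value from above.
int total_q_tiles(const std::vector<int> &seq_lens, int group_size,
                  int block_shape_q) {
    int total = 0;
    for (int seq_len : seq_lens) {
        total += div_up_host(seq_len * group_size, block_shape_q);
    }
    return total;
}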


@@ -337,6 +337,8 @@ __forceinline__ __host__ __device__ void vec_cast<nv_bfloat16, float>(
} else if (deal_each_time == 64) { \
constexpr size_t DEAL_EACH_TIME = 64; \
__VA_ARGS__ \
} else { \
PD_THROW("not support the deal_each_time", deal_each_time); \
}
#define DISPATCH_NUM_THREADS(num_threads, NUM_THREADS, ...) \
@@ -346,6 +348,8 @@ __forceinline__ __host__ __device__ void vec_cast<nv_bfloat16, float>(
} else if (num_threads == 256) { \
constexpr size_t NUM_THREADS = 256; \
__VA_ARGS__ \
} else { \
PD_THROW("not support the num_threads", num_threads); \
}
#define DISPATCH_GQA_GROUP_SIZE(group_size, GROUP_SIZE, ...) \
@@ -376,6 +380,11 @@ __forceinline__ __host__ __device__ void vec_cast<nv_bfloat16, float>(
} else if (group_size == 12) { \
constexpr size_t GROUP_SIZE = 12; \
__VA_ARGS__ \
} else if (group_size == 16) { \
constexpr size_t GROUP_SIZE = 16; \
__VA_ARGS__ \
} else { \
PD_THROW("not support the group_size", group_size); \
}
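These DISPATCH_* macros map a runtime value onto a compile-time constant by enumerating the supported cases and throwing on anything else; the hunks above simply add new cases (e.g. group_size == 16) plus the fallback branch. A stripped-down sketch of the pattern is shown below; the macro name, the supported values, and run_kernel_stub are illustrative.
#include <cstddef>
#include <cstdio>
// Turn a runtime group_size into a constexpr GROUP_SIZE visible to __VA_ARGS__.
#define DISPATCH_EXAMPLE_GROUP_SIZE(group_size, GROUP_SIZE, ...) \
    if (group_size == 8) {                                       \
        constexpr size_t GROUP_SIZE = 8;                         \
        __VA_ARGS__                                              \
    } else if (group_size == 16) {                               \
        constexpr size_t GROUP_SIZE = 16;                        \
        __VA_ARGS__                                              \
    } else {                                                     \
        std::printf("unsupported group_size: %d\n", group_size); \
    }
template <size_t GroupSize>
void run_kernel_stub() { std::printf("GROUP_SIZE = %zu\n", GroupSize); }
void dispatch_example(int group_size) {
    DISPATCH_EXAMPLE_GROUP_SIZE(group_size, GROUP_SIZE,
                                { run_kernel_stub<GROUP_SIZE>(); })
}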
#define DISPATCH_BLOCKSHAPE_Q(block_shape_q, BLOCK_SHAPE_Q, NUM_WARP_Q, ...) \


@@ -13,7 +13,7 @@
// limitations under the License.
#include "paddle/extension.h"
#include "pybind11/pybind11.h"
namespace py = pybind11;
// Custom exception class for handling CUDA errors
@@ -125,45 +125,40 @@ paddle::Tensor FusedExpertMoeFunc(
const bool norm_topk_prob, const bool group_moe);
std::vector<paddle::Tensor> MoeExpertDispatch(
const paddle::Tensor& input,
const paddle::Tensor& gating_output,
const paddle::optional<paddle::Tensor>& gating_correction_bias,
const paddle::optional<paddle::Tensor> &w4a8_in_scale,
const int moe_topk,
const bool group_moe,
const bool topk_only_mode);
const paddle::Tensor &input, const paddle::Tensor &gating_output,
const paddle::optional<paddle::Tensor> &gating_correction_bias,
const paddle::optional<paddle::Tensor> &w4a8_in_scale, const int moe_topk,
const bool group_moe, const bool topk_only_mode);
std::vector<paddle::Tensor>
MoETopKSelectKernel(const paddle::Tensor &gating_logits,
const paddle::optional<paddle::Tensor> &bias,
const int moe_topk, const bool apply_norm_weight,
const bool enable_softmax_top_k_fused);
const paddle::optional<paddle::Tensor> &bias,
const int moe_topk, const bool apply_norm_weight,
const bool enable_softmax_top_k_fused);
std::vector<paddle::Tensor> MoERedundantTopKSelectKernel(
const paddle::Tensor& gating_logits,
const paddle::Tensor& expert_id_to_ep_rank_array,
const paddle::Tensor& expert_in_rank_num_list,
paddle::Tensor& tokens_per_expert_stats_list,
const paddle::optional<paddle::Tensor>& bias,
const int moe_topk,
const bool apply_norm_weight,
const bool enable_softmax_top_k_fused,
const int redundant_ep_rank_num_plus_one);
std::vector<paddle::Tensor>
MoERedundantTopKSelectKernel(const paddle::Tensor &gating_logits,
const paddle::Tensor &expert_id_to_ep_rank_array,
const paddle::Tensor &expert_in_rank_num_list,
paddle::Tensor &tokens_per_expert_stats_list,
const paddle::optional<paddle::Tensor> &bias,
const int moe_topk, const bool apply_norm_weight,
const bool enable_softmax_top_k_fused,
const int redundant_ep_rank_num_plus_one);
std::vector<paddle::Tensor>
EPMoeExpertDispatch(const paddle::Tensor &input, const paddle::Tensor &topk_ids,
const paddle::Tensor &topk_weights,
const paddle::optional<paddle::Tensor> &ffn1_in_scale,
const std::vector<int> &token_nums_per_expert,
const int token_nums_this_rank,
const std::string &moe_quant_type);
const paddle::Tensor &topk_weights,
const paddle::optional<paddle::Tensor> &ffn1_in_scale,
const std::vector<int> &token_nums_per_expert,
const int token_nums_this_rank,
const std::string &moe_quant_type);
std::vector<paddle::Tensor> EPMoeExpertDispatchFP8(
const paddle::Tensor &input, const paddle::Tensor &scale,
const paddle::Tensor &topk_ids, const paddle::Tensor &topk_weights,
const std::vector<int> &token_nums_per_expert,
const std::vector<int> &token_nums_per_expert_padded,
const int token_nums_this_rank, const int token_nums_this_rank_padded);
const paddle::Tensor &token_nums_per_expert,
const paddle::Tensor &token_nums_per_expert_padded);
std::vector<paddle::Tensor> PerTokenQuant(paddle::Tensor &input,
const int block_size);
@@ -180,20 +175,35 @@ std::vector<paddle::Tensor> EPMoeExpertCombine(
const paddle::optional<paddle::Tensor> &ffn2_bias,
const bool norm_topk_prob, const float routed_scaling_factor);
std::vector<std::vector<int>> GetExpertTokenNum(
const paddle::Tensor& topk_ids,
const int num_experts);
std::vector<std::vector<int>> GetExpertTokenNum(const paddle::Tensor &topk_ids,
const int num_experts);
paddle::Tensor MoeExpertFFNFunc(
const paddle::Tensor &permute_input,
const paddle::Tensor &tokens_expert_prefix_sum,
const paddle::Tensor &ffn1_weight, const paddle::Tensor &ffn2_weight,
const paddle::optional<paddle::Tensor> &ffn1_bias,
const paddle::optional<paddle::Tensor> &ffn1_scale,
const paddle::optional<paddle::Tensor> &ffn2_scale,
const paddle::optional<paddle::Tensor> &ffn2_in_scale,
const paddle::optional<paddle::Tensor> &expert_idx_per_token,
const std::string &quant_method, const bool used_in_ep_low_latency);
const paddle::Tensor& permute_input,
const paddle::Tensor& tokens_expert_prefix_sum,
const paddle::Tensor& ffn1_weight, const paddle::Tensor& ffn2_weight,
const paddle::optional<paddle::Tensor>& ffn1_bias,
const paddle::optional<paddle::Tensor>& ffn1_scale,
const paddle::optional<paddle::Tensor>& ffn2_scale,
const paddle::optional<paddle::Tensor>& ffn2_in_scale,
const paddle::optional<paddle::Tensor>& expert_idx_per_token,
const std::string& quant_method, const bool used_in_ep_low_latency);
paddle::Tensor MoeExpertFFNWint2Func(
const paddle::Tensor& permute_input,
const paddle::Tensor& tokens_expert_prefix_sum,
const paddle::Tensor& ffn1_weight,
const paddle::Tensor& ffn2_weight,
const paddle::optional<paddle::Tensor>& ffn1_bias,
const paddle::optional<paddle::Tensor>& ffn1_scale,
const paddle::optional<paddle::Tensor>& ffn2_scale,
const paddle::optional<paddle::Tensor>& ffn1_local_scale,
const paddle::optional<paddle::Tensor>& ffn1_code_scale,
const paddle::optional<paddle::Tensor>& ffn1_code_zp,
const paddle::optional<paddle::Tensor>& ffn2_local_scale,
const paddle::optional<paddle::Tensor>& ffn2_code_scale,
const paddle::optional<paddle::Tensor>& ffn2_code_zp,
const bool used_in_ep_low_latency);
paddle::Tensor MoeExpertReduceFunc(
const paddle::Tensor &ffn_out, const paddle::Tensor &top_k_weight,
@@ -205,19 +215,16 @@ paddle::Tensor MoeExpertReduceFunc(
void InitKVSignalPerQuery(const paddle::Tensor &seq_lens_encoder_tensor,
const paddle::Tensor &seq_lens_this_time_tensor,
const paddle::Tensor &seq_lens_decoder_tensor,
const int rank,
const int num_layers);
const int rank, const int num_layers);
void GetOutputKVSignal(const paddle::Tensor& x,
int64_t rank_id,
void GetOutputKVSignal(const paddle::Tensor &x, int64_t rank_id,
bool wait_flag);
paddle::Tensor DequantInt8Func(const paddle::Tensor &input,
const paddle::Tensor &out_scale,
std::string dtype);
paddle::Tensor OpenShmAndGetMetaSignalFunc(const int rank,
paddle::Tensor OpenShmAndGetMetaSignalFunc(const int rank, const int device_id,
const bool keep_pd_step_flag);
paddle::Tensor InitSignalLayerwiseFunc(const paddle::Tensor &kv_signal_metadata,
@@ -286,61 +293,121 @@ std::vector<paddle::Tensor> ExtractTextTokenOutput(
const paddle::Tensor &seq_lens_this_time,
const paddle::Tensor &cu_seqlens_q, const paddle::Tensor &score_text);
std::vector<paddle::Tensor> MoEDeepGEMMPermute(
const paddle::Tensor& x,
const paddle::Tensor& topk_idx,
const int num_experts,
const int max_tokens_per_expert
);
std::vector<paddle::Tensor> MoEDeepGEMMPermute(const paddle::Tensor &x,
const paddle::Tensor &topk_idx,
const int num_experts,
const int max_tokens_per_expert);
std::vector<paddle::Tensor> MoEDeepGEMMDePermute(
const paddle::Tensor& ffn_out, // [num_experts, max_tokens_per_expert, hidden]
const paddle::Tensor& permute_indices_per_token, // [token_num, topk]
const paddle::Tensor& topk_idx,
const paddle::Tensor& topk_weights
);
const paddle::Tensor
&ffn_out, // [num_experts, max_tokens_per_expert, hidden]
const paddle::Tensor &permute_indices_per_token, // [token_num, topk]
const paddle::Tensor &topk_idx, const paddle::Tensor &topk_weights);
void TextImageIndexOut(const paddle::Tensor &token_type_ids,
const paddle::Tensor &text_input,
const paddle::Tensor &image_input);
void TextImageGatherScatter(paddle::Tensor &input, paddle::Tensor &text_input,
paddle::Tensor &image_input,
paddle::Tensor &token_type_ids,
paddle::Tensor &text_index,
paddle::Tensor &image_index, const bool is_scatter);
paddle::Tensor count_tokens_per_expert_func(const paddle::Tensor &topk_ids,
int64_t num_experts);
std::vector<paddle::Tensor> tritonmoe_preprocess_kernel(const paddle::Tensor& topk_ids, int64_t num_experts, int64_t GEMM_BLOCK_SIZE_M);
std::vector<paddle::Tensor> MoeWna16MarlinGemmApi(
const paddle::Tensor& a,
const paddle::optional<paddle::Tensor>& c_or_none,
const paddle::Tensor& b_q_weight,
const paddle::Tensor& b_scales,
const paddle::optional<paddle::Tensor>& global_scale_or_none,
const paddle::optional<paddle::Tensor>& b_zeros_or_none,
const paddle::optional<paddle::Tensor>& g_idx_or_none,
const paddle::optional<paddle::Tensor>& perm_or_none,
const paddle::Tensor& workspace,
const paddle::Tensor& sorted_token_ids,
const paddle::Tensor& expert_ids,
const paddle::Tensor& num_tokens_post_padded,
const paddle::Tensor& topk_weights,
int64_t moe_block_size,
int64_t top_k,
bool mul_topk_weights,
bool is_ep,
const std::string& b_q_type_str,
int64_t size_m,
int64_t size_n,
int64_t size_k,
bool is_k_full,
bool use_atomic_add,
bool use_fp32_reduce,
bool is_zp_float);
void CutlassScaledMm(paddle::Tensor &c, paddle::Tensor const &a,
paddle::Tensor const &b, paddle::Tensor const &a_scales,
paddle::Tensor const &b_scales,
paddle::optional<paddle::Tensor> const &bias);
void CutlassScaledMmAzp(paddle::Tensor& c, paddle::Tensor const& a,
paddle::Tensor const& b,
paddle::Tensor const& a_scales,
paddle::Tensor const& b_scales,
paddle::Tensor const& azp_adj,
paddle::optional<paddle::Tensor> const& azp,
paddle::optional<paddle::Tensor> const& bias);
void StaticScaledFp8Quant(paddle::Tensor &out, paddle::Tensor const &input,
paddle::Tensor const &scale);
void DynamicScaledFp8Quant(paddle::Tensor &out, paddle::Tensor const &input,
paddle::Tensor &scale);
void DynamicPerTokenScaledFp8Quant(paddle::Tensor &out,
paddle::Tensor const &input,
paddle::Tensor &scales, float scale_ub);
PYBIND11_MODULE(fastdeploy_ops, m) {
m.def("get_expert_token_num", &GetExpertTokenNum,
py::arg("topk_ids"), py::arg("num_experts"),
"get expert token num");
m.def("get_expert_token_num", &GetExpertTokenNum, py::arg("topk_ids"),
py::arg("num_experts"), "get expert token num");
/**
* moe/fused_moe/moe_redundant_topk_select.cu
* moe_redundant_topk_select
*/
m.def("f_moe_redundant_topk_select", &MoERedundantTopKSelectKernel,
py::arg("gating_logits"), py::arg("expert_id_to_ep_rank_array"),
py::arg("expert_in_rank_num_list"),
py::arg("tokens_per_expert_stats_list"), py::arg("bias"),
py::arg("moe_topk"), py::arg("apply_norm_weight"),
py::arg("enable_softmax_top_k_fused"),
py::arg("redundant_ep_rank_num_plus_one"),
"moe export RedundantTopKSelect function");
/**
* moe/fused_moe/moe_redundant_topk_select.cu
* moe_redundant_topk_select
*/
m.def("f_moe_redundant_topk_select", &MoERedundantTopKSelectKernel,
py::arg("gating_logits"), py::arg("expert_id_to_ep_rank_array"),
py::arg("expert_in_rank_num_list"), py::arg("tokens_per_expert_stats_list"),
py::arg("bias"), py::arg("moe_topk"), py::arg("apply_norm_weight"),
py::arg("enable_softmax_top_k_fused"), py::arg("redundant_ep_rank_num_plus_one"),
"moe export RedundantTopKSelect function");
/**
* open_shm_and_get_meta_signal.cc
* InitKVSignalPerQuery
*/
m.def("init_kv_signal_per_query", &InitKVSignalPerQuery,
py::arg("seq_lens_encoder_tensor"),
py::arg("seq_lens_this_time_tensor"),
py::arg("seq_lens_decoder_tensor"), py::arg("rank"),
py::arg("num_layers"), "init_kv_signal_per_query function");
/**
* GetOutputKVSignal
*/
m.def("get_output_kv_signal", &GetOutputKVSignal, py::arg("x"),
py::arg("rank_id"), py::arg("wait_flag"),
"get_output_kv_signal function");
/**
* open_shm_and_get_meta_signal.cc
* InitKVSignalPerQuery
*/
m.def("init_kv_signal_per_query", &InitKVSignalPerQuery,
py::arg("seq_lens_encoder_tensor"), py::arg("seq_lens_this_time_tensor"),
py::arg("seq_lens_decoder_tensor"), py::arg("rank"), py::arg("num_layers"),
"init_kv_signal_per_query function");
/**
* GetOutputKVSignal
*/
m.def("get_output_kv_signal", &GetOutputKVSignal,
py::arg("x"), py::arg("rank_id"), py::arg("wait_flag"),
"get_output_kv_signal function");
m.def("moe_deepgemm_permute", &MoEDeepGEMMPermute, "MoEDeepGEMMPermute");
m.def("moe_deepgemm_depermute", &MoEDeepGEMMDePermute, "MoEDeepGEMMDePermute");
m.def("moe_deepgemm_permute", &MoEDeepGEMMPermute, "MoEDeepGEMMPermute");
m.def("moe_deepgemm_depermute", &MoEDeepGEMMDePermute,
"MoEDeepGEMMDePermute");
/**
* alloc_cache_pinned.cc
* cuda_host_alloc
@@ -398,12 +465,7 @@ PYBIND11_MODULE(fastdeploy_ops, m) {
py::arg("token_nums_per_expert"), py::arg("token_nums_this_rank"),
py::arg("moe_quant_type"), "ep moe export dispatch function");
m.def("ep_moe_expert_dispatch_fp8", &EPMoeExpertDispatchFP8, py::arg("input"),
py::arg("scale"), py::arg("topk_ids"), py::arg("topk_weights"),
py::arg("token_nums_per_expert"),
py::arg("token_nums_per_expert_padded"),
py::arg("token_nums_this_rank"), py::arg("token_nums_this_rank_padded"),
"ep moe export dispatch function");
m.def("ep_moe_expert_dispatch_fp8", &EPMoeExpertDispatchFP8);
m.def("ep_moe_expert_combine", &EPMoeExpertCombine, py::arg("ffn_out"),
py::arg("expert_scales_float"), py::arg("permute_indices_per_token"),
@@ -437,6 +499,12 @@ PYBIND11_MODULE(fastdeploy_ops, m) {
*/
m.def("moe_expert_ffn", &MoeExpertFFNFunc, "moe export ffn function");
/**
* moe/fused_moe/moe_ffn_wint2.cu
* moe_expert_ffn_wint2
*/
m.def("moe_expert_ffn_wint2", &MoeExpertFFNWint2Func, "moe export ffn wint2 function");
/**
* moe/fused_moe/moe_expert_reduce.cu
* moe_expert_reduce
@@ -523,4 +591,66 @@ PYBIND11_MODULE(fastdeploy_ops, m) {
m.def("group_swiglu_with_masked", &GroupSwigluWithMasked,
"group_swiglu_with_masked function");
m.def("text_image_index_out", &TextImageIndexOut,
"text_image_index_out function");
m.def("text_image_gather_scatter", &TextImageGatherScatter,
"text_image_gather_scatter function");
m.def("count_tokens_per_expert_func", &count_tokens_per_expert_func);
m.def("tritonmoe_preprocess_func", &tritonmoe_preprocess_kernel);
m.def("MoeWna16MarlinGemmApi", &MoeWna16MarlinGemmApi,
py::arg("a"),
py::arg("c_or_none"),
py::arg("b_q_weight"),
py::arg("b_scales"),
py::arg("global_scale_or_none"),
py::arg("b_zeros_or_none"),
py::arg("g_idx_or_none"),
py::arg("perm_or_none"),
py::arg("workspace"),
py::arg("sorted_token_ids"),
py::arg("expert_ids"),
py::arg("num_tokens_post_padded"),
py::arg("topk_weights"),
py::arg("moe_block_size"),
py::arg("top_k"),
py::arg("mul_topk_weights"),
py::arg("is_ep"),
py::arg("b_q_type_str"),
py::arg("size_m"),
py::arg("size_n"),
py::arg("size_k"),
py::arg("is_k_full"),
py::arg("use_atomic_add"),
py::arg("use_fp32_reduce"),
py::arg("is_zp_float"));
/**
* cutlass_scaled_mm.cu
* cutlass_scaled_mm
* cutlass_scaled_mm_azp
*/
m.def("cutlass_scaled_mm", &CutlassScaledMm, "cutlass_scaled_mm function");
m.def("cutlass_scaled_mm_azp", &CutlassScaledMmAzp, "cutlass_scaled_mm_azp function");
/**
* quantization/common.cu
* static_scaled_fp8_quant
* dynamic_scaled_fp8_quant
* dynamic_per_token_scaled_fp8_quant
*/
m.def("static_scaled_fp8_quant", &StaticScaledFp8Quant, "static_scaled_fp8_quant function",
py::arg("out"), py::arg("input"), py::arg("scale"));
m.def("dynamic_scaled_fp8_quant", &DynamicScaledFp8Quant,
"dynamic_scaled_fp8_quant function",
py::arg("out"), py::arg("input"), py::arg("scale"));
m.def("dynamic_per_token_scaled_fp8_quant", &DynamicPerTokenScaledFp8Quant,
"dynamic_per_token_scaled_fp8_quant function",
py::arg("out"), py::arg("input"), py::arg("scales"), py::arg("scale_ub"));
}
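The bindings above follow the standard pybind11 pattern: a single PYBIND11_MODULE block in which each C++ entry point is exposed through m.def, optionally with named py::arg parameters and a docstring. A minimal self-contained sketch of that pattern is given below; the module name example_ops and the function add_ints are illustrative, not part of the real fastdeploy_ops surface.
#include "pybind11/pybind11.h"
namespace py = pybind11;
// Toy entry point illustrating the m.def / py::arg pattern used above.
int add_ints(int a, int b) { return a + b; }
PYBIND11_MODULE(example_ops, m) {
    m.def("add_ints", &add_ints, py::arg("a"), py::arg("b"),
          "add two integers");
}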


@@ -0,0 +1,250 @@
/***************************************************************************************************
* Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief Architecture-specific operators on memory added for SM80
*/
#pragma once
#include "cutlass/cutlass.h"
#include "cutlass/complex.h"
#include "cutlass/arch/memory.h"
#include "cutlass/arch/memory_sm75.h"
#include "cutlass/arch/memory_sm80.h"
#include "cutlass/arch/cache_operation.h"
namespace cutlass {
namespace arch {
////////////////////////////////////////////////////////////////////////////////////////////////////
/// Initiates an asynchronous copy from global memory to shared memory.
///
/// cp.async
///
template <
/// Size of the access in bytes
int SizeInBytes,
/// Cache operation
CacheOperation::Kind cache_op = CacheOperation::Always,
bool GlobalToShared = true>
struct copy;
/// Initiates an asynchronous copy from global memory to shared memory. Rather than predicate
/// the entire transfer, zeros are written to SMEM if the guard predicate is false.
///
/// cp.async
///
template <
/// Size of the access in bytes
int SizeInBytes,
/// Cache operation
CacheOperation::Kind cache_op = CacheOperation::Always,
bool GlobalToShared = true>
struct copy_zfill;
/// Blocks until all but <N> previous cp.async.commit_group operations have committed.
///
/// cp.async
///
template <int N, bool GlobalToShared = true>
struct copy_wait;
////////////////////////////////////////////////////////////////////////////////////////////////////
/// Partial specialization
template <
/// Size of the access in bytes
int SizeInBytes>
struct copy<SizeInBytes, CacheOperation::Always, true> {
/// Copy
CUTLASS_DEVICE
copy(void *smem_ptr, void const *global_ptr, bool pred_guard = true) {
cp_async<SizeInBytes, CacheOperation::Always>(smem_ptr, global_ptr, pred_guard);
}
};
/// Partial specialization
template <
/// Size of the access in bytes
int SizeInBytes>
struct copy<SizeInBytes, CacheOperation::Always, false> {
/// Copy
CUTLASS_DEVICE
copy(void *smem_ptr, void const *global_ptr, bool pred_guard = true) {
using AccessType = Array<uint8_t, SizeInBytes>;
if (pred_guard) {
*static_cast<AccessType *>(smem_ptr) = *static_cast<AccessType const *>(global_ptr);
}
}
};
/// Partial specialization
template <
/// Size of the access in bytes
int SizeInBytes>
struct copy_zfill<SizeInBytes, CacheOperation::Always, true> {
/// Copy with zero fill
CUTLASS_DEVICE
copy_zfill(void *smem_ptr, void const *global_ptr, bool pred_guard) {
cp_async_zfill<SizeInBytes, CacheOperation::Always>(smem_ptr, global_ptr, pred_guard);
}
};
/// Partial specialization
template <
/// Size of the access in bytes
int SizeInBytes>
struct copy_zfill<SizeInBytes, CacheOperation::Always, false> {
/// Copy with zero fill
CUTLASS_DEVICE
copy_zfill(void *smem_ptr, void const *global_ptr, bool pred_guard) {
using AccessType = Array<uint8_t, SizeInBytes>;
if (pred_guard) {
*static_cast<AccessType *>(smem_ptr) = *static_cast<AccessType const *>(global_ptr);
}
else {
AccessType zeros;
zeros.clear();
*static_cast<AccessType *>(smem_ptr) = zeros;
}
}
};
////////////////////////////////////////////////////////////////////////////////////////////////////
/// Partial specialization
template <
/// Size of the access in bytes
int SizeInBytes>
struct copy<SizeInBytes, CacheOperation::Global, true> {
/// Copy
CUTLASS_DEVICE
copy(void *smem_ptr, void const *global_ptr, bool pred_guard = true) {
cp_async<SizeInBytes, CacheOperation::Global>(smem_ptr, global_ptr, pred_guard);
}
};
/// Partial specialization
template <
/// Size of the access in bytes
int SizeInBytes>
struct copy<SizeInBytes, CacheOperation::Global, false> {
/// Copy
CUTLASS_DEVICE
copy(void *smem_ptr, void const *global_ptr, bool pred_guard = true) {
using AccessType = Array<uint8_t, SizeInBytes>;
if (pred_guard) {
*static_cast<AccessType *>(smem_ptr) = *static_cast<AccessType const *>(global_ptr);
}
}
};
/// Partial specialization
template <
/// Size of the access in bytes
int SizeInBytes>
struct copy_zfill<SizeInBytes, CacheOperation::Global, true> {
/// Copy with zero fill
CUTLASS_DEVICE
copy_zfill(void *smem_ptr, void const *global_ptr, bool pred_guard = true) {
cp_async_zfill<SizeInBytes, CacheOperation::Global>(smem_ptr, global_ptr, pred_guard);
}
};
/// Partial specialization
template <
/// Size of the access in bytes
int SizeInBytes>
struct copy_zfill<SizeInBytes, CacheOperation::Global, false> {
/// Copy with zero fill
CUTLASS_DEVICE
copy_zfill(void *smem_ptr, void const *global_ptr, bool pred_guard = true) {
using AccessType = Array<uint8_t, SizeInBytes>;
if (pred_guard) {
*static_cast<AccessType *>(smem_ptr) = *static_cast<AccessType const *>(global_ptr);
}
else {
AccessType zeros;
zeros.clear();
*static_cast<AccessType *>(smem_ptr) = zeros;
}
}
};
/// Establishes an ordering w.r.t previously issued cp.async instructions. Does not block.
template <bool GlobalToShared>
CUTLASS_DEVICE
void copy_fence() {}
template <>
CUTLASS_DEVICE
void copy_fence<true>() {
cp_async_fence();
}
////////////////////////////////////////////////////////////////////////////////////////////////////
/// Partial specialization
template <int N>
struct copy_wait<N, false> {
CUTLASS_DEVICE
copy_wait() {}
};
/// Partial specialization
template <int N>
struct copy_wait<N, true> {
CUTLASS_DEVICE
copy_wait() { cp_async_wait<N>(); }
};
/////////////////////////////////////////////////////////////////////////////////////////////////
} // namespace arch
} // namespace cutlass
/////////////////////////////////////////////////////////////////////////////////////////////////
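Taken together, these wrappers let SM80 code issue cp.async copies, fence them, and wait for them through one templated interface, with a synchronous fallback whenever GlobalToShared is false. The kernel below is a hedged sketch of how they might be used to stage one 16-byte fragment per thread; the kernel name and shapes are illustrative, and it assumes this header is included (its path is not shown in the diff) and an SM80+ target.
// Assumes the header above is on the include path and the build targets SM80+.
// Launch with at most 128 threads per block.
__global__ void stage_tile(const float4 *global_src, float4 *result) {
    __shared__ float4 smem[128];
    const int tid = threadIdx.x;
    // Issue an asynchronous 16-byte global->shared copy (cp.async on SM80+).
    cutlass::arch::copy<16, cutlass::arch::CacheOperation::Always, true>(
        smem + tid, global_src + tid, /*pred_guard=*/true);
    // Commit the cp.async instructions issued so far ...
    cutlass::arch::copy_fence<true>();
    // ... and block until all committed groups have completed (N = 0 pending).
    cutlass::arch::copy_wait<0, true>();
    __syncthreads();
    result[tid] = smem[tid];
}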

Some files were not shown because too many files have changed in this diff.