Sync v2.0 version of code to GitHub repo

This commit is contained in:
Jiang-Jia-Jun
2025-06-29 23:29:37 +00:00
parent d151496038
commit 92c2cfa2e7
597 changed files with 78776 additions and 22905 deletions

29
.clang-format Normal file
View File

@@ -0,0 +1,29 @@
# This file is used by clang-format to autoformat paddle source code
#
# clang-format is part of the llvm toolchain.
# You need to install llvm and clang to format the source code.
#
# The basic usage is,
# clang-format -i -style=file PATH/TO/SOURCE/CODE
#
# The -style=file option implicitly uses the ".clang-format" file located in one of the
# parent directories.
# The -i flag means in-place change.
#
# The documentation for clang-format is at:
# http://clang.llvm.org/docs/ClangFormat.html
# http://clang.llvm.org/docs/ClangFormatStyleOptions.html
---
Language: Cpp
BasedOnStyle: Google
IndentWidth: 4
TabWidth: 2
ContinuationIndentWidth: 4
AccessModifierOffset: -1 # private/protected/public have no indent in class
Standard: Cpp11
AllowAllParametersOfDeclarationOnNextLine: true
BinPackParameters: false
BinPackArguments: false
IncludeBlocks: Preserve
IncludeIsMainSourceRegex: (\.cu)$
...

6
.gitignore vendored
View File

@@ -121,7 +121,7 @@ dmypy.json
FETCH_HEAD
#log
log/
log*/
checkpoints/
checkpoints_origin/
@@ -158,3 +158,7 @@ custom_ops/gpu_ops/fp8_deep_gemm/deep_gemm/include/cute
# buff
custom_ops/tmp*
build
.ccls-cache

View File

@@ -16,7 +16,7 @@ repos:
rev: v0.11.7
hooks:
- id: ruff
args: [--output-format, github, --fix]
args: [--output-format, github, --fix, --line-length=120]
# # Spell check
# - repo: https://github.com/codespell-project/codespell
# rev: v2.4.1
@@ -29,14 +29,15 @@ repos:
rev: 6.0.1
hooks:
- id: isort
# Formatting
- repo: https://github.com/pre-commit/mirrors-clang-format
rev: v20.1.3
hooks:
- id: clang-format
# exclude: '.*'
types_or: [c++, cuda]
args: [--style=file, --verbose]
# # Formatting
# - repo: https://github.com/pre-commit/mirrors-clang-format
# rev: v20.1.3
# hooks:
# - id: clang-format
# # exclude: '.*'
# types_or: [c++, cuda]
# args: [--style=file, --verbose]
# markdown
- repo: https://github.com/jackdewinter/pymarkdown
rev: v0.9.29

156
README.md
View File

@@ -1,9 +1,8 @@
# FastDeploy 2.0: Large Language Model Inference and Deployment
<p align="center">
<a href="./LICENSE"><img src="https://img.shields.io/badge/license-Apache%202-dfd.svg"></a>
<a href="https://github.com/PaddlePaddle/FastDeploy/releases"><img src="https://img.shields.io/github/v/release/PaddlePaddle/FastDeploy?color=ffa"></a>
<a href=""><img src="https://img.shields.io/badge/python-3.10+-aff.svg"></a>
<a href="https://github.com/PaddlePaddle/FastDeploy/releases"><img src="https://github.com/user-attachments/assets/42b0039f-39e3-4279-afda-6d1865dfbffb" width="500"></a>
</p>
<p align="center">
<a href=""><img src="https://img.shields.io/badge/python-3.10-aff.svg"></a>
<a href=""><img src="https://img.shields.io/badge/os-linux-pink.svg"></a>
<a href="https://github.com/PaddlePaddle/FastDeploy/graphs/contributors"><img src="https://img.shields.io/github/contributors/PaddlePaddle/FastDeploy?color=9ea"></a>
<a href="https://github.com/PaddlePaddle/FastDeploy/commits"><img src="https://img.shields.io/github/commit-activity/m/PaddlePaddle/FastDeploy?color=3af"></a>
@@ -11,105 +10,78 @@
<a href="https://github.com/PaddlePaddle/FastDeploy/stargazers"><img src="https://img.shields.io/github/stars/PaddlePaddle/FastDeploy?color=ccf"></a>
</p>
FastDeploy has been upgraded to version 2.0 and supports inference for multiple large models (currently only Qwen2 is supported; more models will be added soon). Its inference and deployment features include:
<p align="center">
<a href="docs/get_started/installation/README.md"><b> Installation </b></a>
|
<a href="docs/get_started.md"><b> Quick Start </b></a>
|
<a href="docs/supported_models.md"><b> Supported Models </b></a>
</p>
- Deploy a model as a streaming service with a single command
- Accelerate model inference with tensor parallelism
- Support for PagedAttention and continuous batching (dynamic batching)
- Compatible with the OpenAI HTTP protocol
- Weight-only int8/int4 lossless compression
- Prometheus metrics support
--------------------------------------------------------------------------------
# FastDeploy 2.0: Inference and Deployment Toolkit for LLMs and VLMs based on PaddlePaddle
> Note: If you are still using FastDeploy to deploy small models (e.g. CV suite models such as PaddleClas/PaddleOCR), please check out the [release/1.1.0 branch](https://github.com/PaddlePaddle/FastDeploy/tree/release/1.1.0).
## News
## Environment Requirements
- A800/H800/H100
- Python>=3.10
- CUDA>=12.3
- CUDNN>=9.5
- Linux X64
**[2025-06] 🔥 Released FastDeploy v2.0:** Supports inference and deployment for ERNIE 4.5. Furthermore, we open-source an industrial-grade PD disaggregation solution with context caching and dynamic role switching for effective resource utilization, further enhancing inference performance for MoE models.
## Installation
## About
### Docker Installation (Recommended)
```
docker pull ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/fastdeploy:2.0.0.0-alpha
```
**FastDeploy** is an inference and deployment toolkit for large language models and visual language models based on PaddlePaddle. It delivers **production-ready, out-of-the-box deployment solutions** with core acceleration technologies:
### Build from Source
#### Install PaddlePaddle
> Note: Install the nightly build version (the code must be newer than 2025.05.30); see [PaddlePaddle Installation](https://www.paddlepaddle.org.cn/en/install/quick?docurl=/documentation/docs/en/develop/install/pip/linux-pip_en.html) and choose the CUDA 12.6 develop (Nightly build) version.
```
python -m pip install --pre paddlepaddle-gpu -i https://www.paddlepaddle.org.cn/packages/nightly/cu126/
```
- 🚀 **Load-Balanced PD Disaggregation**: Industrial-grade solution featuring context caching and dynamic instance role switching. Optimizes resource utilization while balancing SLO compliance and throughput.
- 🔄 **Unified KV Cache Transmission**: Lightweight high-performance transport library with intelligent NVLink/RDMA selection.
- 🤝 **OpenAI API Server and vLLM Compatible**: One-command deployment with [vLLM](https://github.com/vllm-project/vllm/) interface compatibility.
- 🧮 **Comprehensive Quantization Format Support**: W8A16, W8A8, W4A16, W4A8, W2A16, FP8, and more.
- **Advanced Acceleration Techniques**: Speculative decoding, Multi-Token Prediction (MTP) and Chunked Prefill.
- 🖥️ **Multi-Hardware Support**: NVIDIA GPU, Kunlunxin XPU, Hygon DCU, Ascend NPU, Iluvatar GPU, Enflame GCU, MetaX GPU etc.
#### Build and Install FastDeploy
## Requirements
```
# Build
cd FastDeploy
bash build.sh
# Install
pip install dist/fastdeploy-2.0.0a0-py3-none-any.whl
```
- OS: Linux
- Python: 3.10 ~ 3.12
## Quick Start
## Installation
After installation, run the following commands to quickly deploy the Qwen2 model. For more parameters and their meanings, see the [parameter documentation](docs/serving.md).
FastDeploy supports inference deployment on **NVIDIA GPUs**, **Kunlunxin XPUs**, **Iluvatar GPUs**, **Enflame GCUs**, and other hardware. For detailed installation instructions:
``` shell
# Download and extract the Qwen model
wget https://fastdeploy.bj.bcebos.com/llm/models/Qwen2-7B-Instruct.tar.gz && tar xvf Qwen2-7B-Instruct.tar.gz
# Deploy on a single GPU
python -m fastdeploy.entrypoints.openai.api_server --model ./Qwen2-7B-Instruct --port 8188 --tensor-parallel-size 1
```
- [NVIDIA GPU](./docs/installation/nvidia_cuda.md)
- [Kunlunxin XPU](./docs/en/get_started/installation/kunlunxin_xpu.md)
- [Iluvatar GPU](./docs/en/get_started/installation/iluvatar_gpu.md)
- [Enflame GCU](./docs/en/get_started/installation/Enflame_gcu.md)
Send requests to the model service with the following command:
``` shell
curl -X POST "http://0.0.0.0:8188/v1/chat/completions" \
-H "Content-Type: application/json" \
-d '{
"messages": [
{"role": "user", "content": "你好,你的名字是什么?"}
]
}'
```
The response looks like this:
``` json
{
"id": "chatcmpl-db662f47-7c8c-4945-9a7a-db563b2ddd8d",
"object": "chat.completion",
"created": 1749451045,
"model": "default",
"choices": [
{
"index": 0,
"message": {
"role": "assistant",
"content": "你好!我叫通义千问。",
"reasoning_content": null
},
"finish_reason": "stop"
}
],
"usage": {
"prompt_tokens": 25,
"total_tokens": 35,
"completion_tokens": 10,
"prompt_tokens_details": null
}
}
```
FastDeploy provides a service API fully compatible with OpenAI (the `model` and `api_key` fields are currently unsupported and will be ignored if set). You can also send requests to the service with the openai Python client, as sketched below.
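A minimal sketch using the openai Python client (installed separately, e.g. `pip install openai`), assuming the quick-start service above is listening on port 8188; the `api_key` and `model` values are placeholders, since the server ignores them:

``` python
# Minimal sketch: query the FastDeploy OpenAI-compatible server with the openai client.
# Assumes the quick-start deployment above (port 8188); api_key/model are placeholders
# because the server currently ignores both fields.
from openai import OpenAI

client = OpenAI(base_url="http://0.0.0.0:8188/v1", api_key="EMPTY")

response = client.chat.completions.create(
    model="default",
    messages=[{"role": "user", "content": "你好,你的名字是什么?"}],
)
print(response.choices[0].message.content)
```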
**Note:** We are actively working on expanding hardware support. Additional hardware platforms including Ascend NPU, Hygon DCU, and MetaX GPU are currently under development and testing. Stay tuned for updates!
## Deployment Documentation
- [Offline Inference](docs/offline_inference.md)
- [Service Deployment](docs/serving.md)
- [Service Metrics](docs/metrics.md)
## Get Started
# Code Overview
- [Code Directory Guide](docs/code_guide.md)
- Suggestions and issues encountered while using FastDeploy are welcome via GitHub issues.
Learn how to use FastDeploy through our documentation:
- [10-Minutes Quick Deployment](./docs/get_started/quick_start.md)
- [ERNIE-4.5 Large Language Model Deployment](./docs/get_started/ernie-4.5.md)
- [ERNIE-4.5-VL Multimodal Model Deployment](./docs/get_started/ernie-4.5-vl.md)
- [Offline Inference Development](./docs/offline_inference.md)
- [Online Service Deployment](./docs/serving/README.md)
- [Full Supported Models List](./docs/supported_models.md)
# License
FastDeploy is licensed under the [Apache-2.0 license](./LICENSE). During development, in order to align with the [vLLM](https://github.com/vllm-project/vllm) interface, parts of the vLLM code were referenced and used directly, for which we express our gratitude.
## Supported Models
| Model | Data Type | PD Disaggregation | Chunked Prefill | Prefix Caching | MTP | CUDA Graph | Maximum Context Length |
|:--- | :------- | :---------- | :-------- | :-------- | :----- | :----- | :----- |
|ERNIE-4.5-300B-A47B | BF16/WINT4/WINT8/W4A8C8/WINT2/FP8 | ✅(WINT4/W4A8C8/Expert Parallelism)| ✅ | ✅|✅(WINT4)| WIP |128K |
|ERNIE-4.5-300B-A47B-Base| BF16/WINT4/WINT8 | ✅(WINT4/Expert Parallelism)| ✅ | ✅|✅(WINT4)| ❌ | 128K |
|ERNIE-4.5-VL-424B-A47B | BF16/WINT4/WINT8 | WIP | ✅ | WIP | ❌ | WIP |128K |
|ERNIE-4.5-VL-28B-A3B | BF16/WINT4/WINT8 | ❌ | ✅ | WIP | ❌ | WIP |128K |
|ERNIE-4.5-21B-A3B | BF16/WINT4/WINT8/FP8 | ❌ | ✅ | ✅ | WIP | ✅|128K |
|ERNIE-4.5-21B-A3B-Base | BF16/WINT4/WINT8/FP8 | ❌ | ✅ | ✅ | WIP | ✅|128K |
|ERNIE-4.5-0.3B | BF16/WINT8/FP8 | ❌ | ✅ | ✅ | ❌ | ✅| 128K |
## Advanced Usage
- [Quantization](./docs/quantization/README.md)
- [PD Disaggregation Deployment](./docs/features/pd_disaggregation.md)
- [Speculative Decoding](./docs/features/speculative_decoding.md)
- [Prefix Caching](./docs/features/prefix_caching.md)
- [Chunked Prefill](./docs/features/chunked_prefill.md)
## Acknowledgement
FastDeploy is licensed under the [Apache-2.0 open-source license](./LICENSE). During development, portions of [vLLM](https://github.com/vllm-project/vllm) code were referenced and incorporated to maintain interface compatibility, for which we express our gratitude.

106
benchmarks/README.md Normal file
View File

@@ -0,0 +1,106 @@
### FastDeploy Serving Performance Benchmark Tool
#### Dataset:
Download it locally with wget for performance testing.
<table style="width:100%; border-collapse: collapse;">
<thead>
<tr>
<th style="width:15%; text-align: left;">Dataset</th>
<th style="width:65%; text-align: left;">Data Path</th>
</tr>
</thead>
<tbody>
<tr>
<td><strong>Open-source dataset (2,000 samples)</strong></td>
<td><code>https://fastdeploy.bj.bcebos.com/eb_query/filtered_sharedgpt_2000_input_1136_output_200_fd.json</code></td>
</tr>
</tbody>
</table>
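The EBChat dataset loader added in `benchmark_dataset.py` in this commit reads the FD-format file as one JSON object per line, each with a `messages` list and an optional `max_tokens` field. A hypothetical sketch of producing one such record (the prompt text and file name are illustrative only):

```python
# Hypothetical sketch of one FD-format record as read by EBChatDataset
# (one JSON object per line, with "messages" and an optional "max_tokens").
import json

sample = {
    "messages": [{"role": "user", "content": "你好,你的名字是什么?"}],
    "max_tokens": 12288,
}

with open("sample_fd_dataset.jsonl", "w", encoding="utf-8") as f:
    f.write(json.dumps(sample, ensure_ascii=False) + "\n")
```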
#### Usage:
```
# Install dependencies
python -m pip install -r requirements.txt
```
##### Parameter Description
```bash
--backend openai-chat: backend interface for the benchmark; set to "openai-chat" to use the chat/completions endpoint
--model EB45T: model name; any name works and only affects the name of the saved result file
--endpoint /v1/chat/completions: endpoint, used to build the request URL
--host 0.0.0.0: service IP address, used to build the request URL
--port 9812: service HTTP port, used to build the request URL
--dataset-name EBChat: dataset class; set to "EBChat" to read re-exported FD-format datasets
--dataset-path ./eb45t_spv4_dataserver_1w_waigua_fd: path to the benchmark dataset
--hyperparameter-path EB45T.yaml: (optional) hyperparameter file; its contents are merged into the request payload (no hyperparameters by default)
--percentile-metrics ttft,tpot,itl,e2el,s_ttft,s_itl,s_e2el,s_decode,input_len,s_input_len,output_len: set of metrics reported in the performance results
--metric-percentiles 80,95,99,99.9,99.95,99.99: percentiles reported for the performance metrics
--num-prompts 1: total number of requests to send
--max-concurrency 1: benchmark concurrency
--save-result: enable result saving; results are written to a json file
```
##### Benchmarking the /v1/chat/completions endpoint (single-request debugging)
```
python benchmark_serving.py \
--backend openai-chat \
--model EB45T \
--endpoint /v1/chat/completions \
--host 0.0.0.0 \
--port 9812 \
--dataset-name EBChat \
--dataset-path ./filtered_sharedgpt_2000_input_1136_output_200_fd.json \
--hyperparameter-path yaml/request_yaml/eb45t-32k.yaml \
--percentile-metrics ttft,tpot,itl,e2el,s_ttft,s_itl,s_e2el,s_decode,input_len,s_input_len,output_len \
--metric-percentiles 80,95,99,99.9,99.95,99.99 \
--num-prompts 1 \
--max-concurrency 1 \
--save-result
```
##### Full benchmark of the /v1/chat/completions endpoint (100 concurrent, 2000 requests)
```
# Save output to infer_log.txt
python benchmark_serving.py \
--backend openai-chat \
--model EB45T \
--endpoint /v1/chat/completions \
--host 0.0.0.0 \
--port 9812 \
--dataset-name EBChat \
--dataset-path ./filtered_sharedgpt_2000_input_1136_output_200_fd.json \
--hyperparameter-path yaml/request_yaml/eb45t-32k.yaml \
--percentile-metrics ttft,tpot,itl,e2el,s_ttft,s_itl,s_e2el,s_decode,input_len,s_input_len,output_len \
--metric-percentiles 80,95,99,99.9,99.95,99.99 \
--num-prompts 2000 \
--max-concurrency 100 \
--save-result > infer_log.txt 2>&1 &
```
##### Benchmarking the /v1/completions endpoint
Change the endpoint to /v1/completions and the backend to openai to benchmark the /v1/completions endpoint.
```
# Save output to infer_log.txt
python benchmark_serving.py \
--backend openai \
--model EB45T \
--endpoint /v1/completions \
--host 0.0.0.0 \
--port 9812 \
--dataset-name EBChat \
--dataset-path ./filtered_sharedgpt_2000_input_1136_output_200_fd.json \
--hyperparameter-path yaml/request_yaml/eb45t-32k.yaml \
--percentile-metrics ttft,tpot,itl,e2el,s_ttft,s_itl,s_e2el,s_decode,input_len,s_input_len,output_len \
--metric-percentiles 80,95,99,99.9,99.95,99.99 \
--num-prompts 2000 \
--max-concurrency 100 \
--save-result > infer_log.txt 2>&1 &
```

View File

@@ -0,0 +1,700 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
# This file is modified from https://github.com/vllm-project/vllm/blob/main/benchmarks/backend_request_func.py
import io
import json
import os
import sys
import time
import traceback
from dataclasses import dataclass, field
from typing import Optional
import aiohttp
from tqdm.asyncio import tqdm
AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=6 * 60 * 60)
@dataclass
class RequestFuncInput:
"""Input for requesting LLMs via API"""
prompt: str
history_QA: Optional[dict]
hyper_parameters: dict
api_url: str
prompt_len: int
output_len: int
model: str
model_name: Optional[str] = None
logprobs: Optional[int] = None
extra_body: Optional[dict] = None
multi_modal_content: Optional[dict] = None
ignore_eos: bool = False
language: Optional[str] = None
@dataclass
class RequestFuncOutput:
"""Output for requesting LLMs via API"""
generated_text: str = ""
reasoning_content: str = ""
success: bool = False
latency: float = 0.0
output_tokens: int = 0
ttft: float = 0.0 # Time to first token
arrival_time: list = field(default_factory=list) # arrival_time
itl: list = field(default_factory=list) # list of inter-token latencies
tpot: float = 0.0 # avg next-token latencies
prompt_len: int = 0
prompt_tokens: int = 0  # number of input tokens reported by the inference side
error: str = ""
async def async_request_eb_openai_chat_completions(
request_func_input: RequestFuncInput,
pbar: Optional[tqdm] = None,
) -> RequestFuncOutput:
"""Request an LLM using EB OpenAI"""
api_url = request_func_input.api_url
assert api_url.endswith(
("completions", "profile")
), "OpenAI Chat Completions API URL must end with 'completions'."
async with aiohttp.ClientSession(trust_env=True,
timeout=AIOHTTP_TIMEOUT) as session:
content = [{"type": "text", "text": request_func_input.prompt}]
if request_func_input.multi_modal_content:
content.append(request_func_input.multi_modal_content)
payload = {
"model": "default",
"messages": request_func_input.history_QA,
"stream": True,
"stream_options": {
"include_usage": True,
"continuous_usage_stats": True
},
}
# hyperparameters are passed in via the yaml file
payload.update(request_func_input.hyper_parameters)
if request_func_input.ignore_eos:
payload["ignore_eos"] = request_func_input.ignore_eos
headers = {
"Content-Type": "application/json",
"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}",
}
output = RequestFuncOutput()
output.prompt_len = 0
ttft = 0.0
st = time.perf_counter()
most_recent_timestamp = st
try:
async with session.post(url=api_url, json=payload,
headers=headers) as response:
if response.status == 200:
async for chunk_bytes in response.content:
chunk_bytes = chunk_bytes.strip()
if not chunk_bytes:
continue
chunk = chunk_bytes.decode("utf-8").removeprefix(
"data: ")
if chunk != "[DONE]":
# print("####chunk:", chunk, type(chunk))
timestamp = time.perf_counter()
data = json.loads(chunk)
if choices := data.get("choices"):
content = choices[0]["delta"].get("content")
reason_content = choices[0]["delta"].get("reasoning_content")
# First token
if ttft == 0.0:
ttft = timestamp - st
output.ttft = ttft
# cached_tokens
output.prompt_len = data["usage"]["prompt_tokens_details"]["cached_tokens"]
# Decoding phase
else:
output.itl.append(timestamp -
most_recent_timestamp)
output.generated_text += content or ""
output.reasoning_content += reason_content or ""
output.arrival_time.append(choices[0].get("arrival_time"))
elif usage := data.get("usage"):
output.output_tokens = usage.get(
"completion_tokens")
output.prompt_tokens = usage.get(
"prompt_tokens")
most_recent_timestamp = timestamp
# output.generated_text = generated_text
if output.generated_text.strip() == "":
output.success = False
output.error = "No generated text found!"
else:
output.success = True
output.latency = most_recent_timestamp - st
else:
error_text = await response.text()
print("####error response:", error_text, "####payload:", payload)
output.error = error_text or ""
output.success = False
except Exception:
output.success = False
exc_info = sys.exc_info()
output.error = "".join(traceback.format_exception(*exc_info))
# save the results of failed requests
if not output.success:
with open("error_output.txt", "a") as f:
f.write(str(output) + "\n")
if pbar:
pbar.update(1)
return output
async def async_request_eb_openai_completions(
request_func_input: RequestFuncInput,
pbar: Optional[tqdm] = None,
) -> RequestFuncOutput:
"""Request an LLM using EB OpenAI"""
api_url = request_func_input.api_url
assert api_url.endswith(
("completions", "profile")
), "OpenAI Completions API URL must end with 'completions' or 'profile'."
async with aiohttp.ClientSession(trust_env=True,
timeout=AIOHTTP_TIMEOUT) as session:
payload = {
"model": "default",
"prompt": request_func_input.prompt,
"stream": True,
"stream_options": {
"include_usage": True,
"continuous_usage_stats": True
},
}
# hyperparameters are passed in via the yaml file
payload.update(request_func_input.hyper_parameters)
if request_func_input.ignore_eos:
payload["ignore_eos"] = request_func_input.ignore_eos
headers = {
"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"
}
output = RequestFuncOutput()
output.prompt_len = request_func_input.prompt_len
generated_text = ""
st = time.perf_counter()
most_recent_timestamp = st
try:
async with session.post(url=api_url, json=payload,
headers=headers) as response:
if response.status == 200:
first_chunk_received = False
async for chunk_bytes in response.content:
chunk_bytes = chunk_bytes.strip()
if not chunk_bytes:
continue
chunk = chunk_bytes.decode("utf-8").removeprefix(
"data: ")
if chunk != "[DONE]":
# print("####chunk:", chunk, chunk.usage)
data = json.loads(chunk)
# NOTE: Some completion API might have a last
# usage summary response without a token so we
# want to check a token was generated
if choices := data.get("choices"):
# Note that text could be empty here
# e.g. for special tokens
text = choices[0].get("text")
timestamp = time.perf_counter()
# First token
if not first_chunk_received:
first_chunk_received = True
ttft = time.perf_counter() - st
output.ttft = ttft
# Decoding phase
else:
output.itl.append(timestamp -
most_recent_timestamp)
most_recent_timestamp = timestamp
output.arrival_time.append(choices[0].get("arrival_time"))
generated_text += text or ""
elif usage := data.get("usage"):
output.prompt_tokens = usage.get(
"prompt_tokens")
output.output_tokens = usage.get(
"completion_tokens")
if first_chunk_received:
output.success = True
else:
output.success = False
output.error = (
"Never received a valid chunk to calculate TTFT."
"This response will be marked as failed!")
output.generated_text = generated_text
output.latency = most_recent_timestamp - st
else:
output.error = response.reason or ""
output.success = False
except Exception:
output.success = False
exc_info = sys.exc_info()
output.error = "".join(traceback.format_exception(*exc_info))
if pbar:
pbar.update(1)
return output
async def async_request_tgi(
request_func_input: RequestFuncInput,
pbar: Optional[tqdm] = None,
) -> RequestFuncOutput:
"""Request an LLM using the TGI API"""
api_url = request_func_input.api_url
assert api_url.endswith("generate_stream")
async with aiohttp.ClientSession(trust_env=True,
timeout=AIOHTTP_TIMEOUT) as session:
params = {
"max_new_tokens": request_func_input.output_len,
"do_sample": True,
"temperature": 0.01, # TGI does not accept 0.0 temperature.
"top_p": 0.99, # TGI does not accept 1.0 top_p.
"truncate": request_func_input.prompt_len,
"ignore_eos_token": request_func_input.ignore_eos,
}
payload = {
"inputs": request_func_input.prompt,
"parameters": params,
}
output = RequestFuncOutput()
output.prompt_len = request_func_input.prompt_len
if request_func_input.ignore_eos:
output.output_tokens = request_func_input.output_len
else:
output.output_tokens = None
ttft = 0.0
st = time.perf_counter()
most_recent_timestamp = st
try:
async with session.post(url=api_url, json=payload) as response:
if response.status == 200:
async for chunk_bytes in response.content:
chunk_bytes = chunk_bytes.strip()
if not chunk_bytes:
continue
chunk_bytes = chunk_bytes.decode("utf-8")
# NOTE: Sometimes TGI returns a ping response without
# any data, we should skip it.
if chunk_bytes.startswith(":"):
continue
chunk = chunk_bytes.removeprefix("data:")
data = json.loads(chunk)
timestamp = time.perf_counter()
# First token
if ttft == 0.0:
ttft = time.perf_counter() - st
output.ttft = ttft
# Decoding phase
else:
output.itl.append(timestamp -
most_recent_timestamp)
most_recent_timestamp = timestamp
output.arrival_time.append(data["arrival_time"])
output.latency = most_recent_timestamp - st
output.success = True
output.generated_text = data["generated_text"]
else:
output.error = response.reason or ""
output.success = False
except Exception:
output.success = False
exc_info = sys.exc_info()
output.error = "".join(traceback.format_exception(*exc_info))
if pbar:
pbar.update(1)
return output
async def async_request_trt_llm(
request_func_input: RequestFuncInput,
pbar: Optional[tqdm] = None,
) -> RequestFuncOutput:
"""Request an LLM using TRT's llm_server"""
api_url = request_func_input.api_url
assert api_url.endswith("generate_stream")
async with aiohttp.ClientSession(trust_env=True,
timeout=AIOHTTP_TIMEOUT) as session:
payload = {
"accumulate_tokens": True,
"text_input": request_func_input.prompt,
"temperature": 0.0,
"top_p": 1.0,
"max_tokens": request_func_input.output_len,
"stream": True,
}
if request_func_input.ignore_eos:
payload["min_length"] = request_func_input.output_len
output = RequestFuncOutput()
output.prompt_len = request_func_input.prompt_len
ttft = 0.0
st = time.perf_counter()
most_recent_timestamp = st
try:
async with session.post(url=api_url, json=payload) as response:
if response.status == 200:
async for chunk_bytes in response.content:
chunk_bytes = chunk_bytes.strip()
if not chunk_bytes:
continue
chunk = chunk_bytes.decode("utf-8").removeprefix(
"data:")
data = json.loads(chunk)
output.generated_text += data["text_output"]
timestamp = time.perf_counter()
# First token
if ttft == 0.0:
ttft = timestamp - st
output.ttft = ttft
# Decoding phase
else:
output.itl.append(timestamp -
most_recent_timestamp)
most_recent_timestamp = timestamp
output.latency = most_recent_timestamp - st
output.success = True
else:
output.error = response.reason or ""
output.success = False
except Exception:
output.success = False
exc_info = sys.exc_info()
output.error = "".join(traceback.format_exception(*exc_info))
if pbar:
pbar.update(1)
return output
async def async_request_deepspeed_mii(
request_func_input: RequestFuncInput,
pbar: Optional[tqdm] = None,
) -> RequestFuncOutput:
"""Request an LLM using Deepspeed MII"""
async with aiohttp.ClientSession(trust_env=True,
timeout=AIOHTTP_TIMEOUT) as session:
payload = {
"prompt": request_func_input.prompt,
"max_tokens": request_func_input.output_len,
"temperature": 0.01, # deepspeed-mii does not accept 0.0 temp.
"top_p": 1.0,
}
output = RequestFuncOutput()
output.prompt_len = request_func_input.prompt_len
# NOTE: DeepSpeed-MII doesn't support streaming as of Jan 28 2024,
# will use 0 as placeholder.
# See https://github.com/microsoft/DeepSpeed-MII/pull/311
output.ttft = 0
st = time.perf_counter()
try:
async with session.post(url=request_func_input.api_url,
json=payload) as response:
if response.status == 200:
parsed_resp = await response.json()
output.latency = time.perf_counter() - st
if "choices" in parsed_resp:
output.generated_text = parsed_resp["choices"][0][
"text"]
elif "text" in parsed_resp:
output.generated_text = parsed_resp["text"][0]
else:
output.error = ("Unexpected response format: "
"neither 'choices' nor 'text' found")
output.success = False
output.success = True
else:
output.error = response.reason or ""
output.success = False
except Exception:
output.success = False
exc_info = sys.exc_info()
output.error = "".join(traceback.format_exception(*exc_info))
if pbar:
pbar.update(1)
return output
async def async_request_openai_completions(
request_func_input: RequestFuncInput,
pbar: Optional[tqdm] = None,
) -> RequestFuncOutput:
"""Request an LLM using OpenAI"""
api_url = request_func_input.api_url
assert api_url.endswith(
("completions", "profile")
), "OpenAI Completions API URL must end with 'completions' or 'profile'."
async with aiohttp.ClientSession(trust_env=True,
timeout=AIOHTTP_TIMEOUT) as session:
payload = {
"model": request_func_input.model_name \
if request_func_input.model_name else request_func_input.model,
"prompt": request_func_input.prompt,
# "temperature": 0.0,
"max_tokens": request_func_input.output_len,
"logprobs": request_func_input.logprobs,
"stream": True,
#"stream_options": {
# "include_usage": True,
#},
}
if request_func_input.ignore_eos:
payload["ignore_eos"] = request_func_input.ignore_eos
headers = {
"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"
}
output = RequestFuncOutput()
output.prompt_len = request_func_input.prompt_len
generated_text = ""
st = time.perf_counter()
most_recent_timestamp = st
try:
async with session.post(url=api_url, json=payload,
headers=headers) as response:
if response.status == 200:
first_chunk_received = False
async for chunk_bytes in response.content:
chunk_bytes = chunk_bytes.strip()
if not chunk_bytes:
continue
chunk = chunk_bytes.decode("utf-8").removeprefix(
"data: ")
if chunk != "[DONE]":
# print("####chunk:", chunk, type(chunk))
data = json.loads(chunk)
# NOTE: Some completion API might have a last
# usage summary response without a token so we
# want to check a token was generated
if choices := data.get("choices"):
# Note that text could be empty here
# e.g. for special tokens
text = choices[0].get("text")
timestamp = time.perf_counter()
# First token
if not first_chunk_received:
first_chunk_received = True
ttft = time.perf_counter() - st
output.ttft = ttft
# Decoding phase
else:
output.itl.append(timestamp -
most_recent_timestamp)
most_recent_timestamp = timestamp
generated_text += text or ""
elif usage := data.get("usage"):
output.output_tokens = usage.get(
"completion_tokens")
if first_chunk_received:
output.success = True
else:
output.success = False
output.error = (
"Never received a valid chunk to calculate TTFT."
"This response will be marked as failed!")
output.generated_text = generated_text
output.latency = most_recent_timestamp - st
else:
output.error = response.reason or ""
output.success = False
except Exception:
output.success = False
exc_info = sys.exc_info()
output.error = "".join(traceback.format_exception(*exc_info))
if pbar:
pbar.update(1)
return output
async def async_request_openai_audio(
request_func_input: RequestFuncInput,
pbar: Optional[tqdm] = None,
) -> RequestFuncOutput:
"""Request an LLM using OpenAI"""
# Lazy import without PlaceholderModule to avoid vllm dep.
import soundfile
api_url = request_func_input.api_url
assert api_url.endswith(
("transcriptions", "translations"
)), "OpenAI Chat Completions API URL must end with 'transcriptions' "
"or `translations`."
async with aiohttp.ClientSession(trust_env=True,
timeout=AIOHTTP_TIMEOUT) as session:
content = [{"type": "text", "text": request_func_input.prompt}]
payload = {
"model": request_func_input.model_name \
if request_func_input.model_name else request_func_input.model,
"temperature": 0.0,
"max_completion_tokens": request_func_input.output_len,
"stream": True,
"language": "en",
# Flattened due to multipart/form-data
"stream_include_usage": True,
"stream_continuous_usage_stats": True
}
if request_func_input.extra_body:
payload.update(request_func_input.extra_body)
headers = {
"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}",
}
# Send audio file
def to_bytes(y, sr):
buffer = io.BytesIO()
soundfile.write(buffer, y, sr, format="WAV")
buffer.seek(0)
return buffer
with to_bytes(*request_func_input.multi_modal_content['audio']) as f:
form = aiohttp.FormData()
form.add_field('file', f, content_type='audio/wav')
for key, value in payload.items():
form.add_field(key, str(value))
output = RequestFuncOutput()
output.prompt_len = request_func_input.prompt_len
generated_text = ""
ttft = 0.0
st = time.perf_counter()
most_recent_timestamp = st
try:
async with session.post(url=api_url,
data=form,
headers=headers) as response:
if response.status == 200:
async for chunk_bytes in response.content:
chunk_bytes = chunk_bytes.strip()
if not chunk_bytes:
continue
chunk = chunk_bytes.decode("utf-8").removeprefix(
"data: ")
if chunk != "[DONE]":
timestamp = time.perf_counter()
data = json.loads(chunk)
if choices := data.get("choices"):
content = choices[0]["delta"].get(
"content")
# First token
if ttft == 0.0:
ttft = timestamp - st
output.ttft = ttft
# Decoding phase
else:
output.itl.append(
timestamp - most_recent_timestamp)
generated_text += content or ""
elif usage := data.get("usage"):
output.output_tokens = usage.get(
"completion_tokens")
most_recent_timestamp = timestamp
output.generated_text = generated_text
output.success = True
output.latency = most_recent_timestamp - st
else:
output.error = response.reason or ""
output.success = False
except Exception:
output.success = False
exc_info = sys.exc_info()
output.error = "".join(traceback.format_exception(*exc_info))
if pbar:
pbar.update(1)
return output
ASYNC_REQUEST_FUNCS = {
"tgi": async_request_tgi,
"vllm": async_request_openai_completions,
"lmdeploy": async_request_openai_completions,
"deepspeed-mii": async_request_deepspeed_mii,
"openai": async_request_eb_openai_completions,
"openai-chat": async_request_eb_openai_chat_completions,
"openai-audio": async_request_openai_audio,
"tensorrt-llm": async_request_trt_llm,
"scalellm": async_request_openai_completions,
"sglang": async_request_openai_completions,
}
OPENAI_COMPATIBLE_BACKENDS = [
k for k, v in ASYNC_REQUEST_FUNCS.items()
if v in (async_request_openai_completions,
async_request_eb_openai_chat_completions)
]

View File

@@ -0,0 +1,309 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
# This file is modified from https://github.com/vllm-project/vllm/blob/main/benchmarks/benchmark_dataset.py
import base64
import io
import json
import logging
import random
from abc import ABC, abstractmethod
from collections.abc import Mapping
from dataclasses import dataclass
from io import BytesIO
from typing import Any, Callable, Optional, Union
from PIL import Image
logger = logging.getLogger(__name__)
@dataclass
class SampleRequest:
"""
Represents a single inference request for benchmarking.
"""
prompt: Union[str, Any]
history_QA: Union[str, Any]
json_data: Optional[dict]
prompt_len: int
expected_output_len: int
class BenchmarkDataset(ABC):
"""BenchmarkDataset"""
DEFAULT_SEED = 0
IS_MULTIMODAL = False
def __init__(
self,
dataset_path: Optional[str] = None,
random_seed: int = DEFAULT_SEED,
hyperparameter_path: Optional[str] = None,
) -> None:
"""
Initialize the BenchmarkDataset with an optional dataset path and random
seed. Args:
dataset_path (Optional[str]): Path to the dataset. If None, it
indicates that a default or random dataset might be used.
random_seed (int): Seed value for reproducible shuffling or
sampling. Defaults to DEFAULT_SEED.
"""
self.dataset_path = dataset_path
# Set the random seed, ensuring that a None value is replaced with the
# default seed.
self.random_seed = (random_seed
if random_seed is not None else self.DEFAULT_SEED)
self.data = None
self.hyperparameter_path = hyperparameter_path
self.hyperparameters = {}
def load_data(self) -> None:
"""
Load data from the dataset path into self.data.
This method must be overridden by subclasses since the method to load
data will vary depending on the dataset format and source.
Raises:
NotImplementedError: If a subclass does not implement this method.
"""
# TODO (jenniferzhao): add support for downloading data
raise NotImplementedError(
"load_data must be implemented in subclasses.")
@abstractmethod
def sample(self, num_requests: int) -> list[SampleRequest]:
"""
Abstract method to generate sample requests from the dataset.
Subclasses must override this method to implement dataset-specific logic
for generating a list of SampleRequest objects.
Args:
num_requests (int): The number of sample requests to generate.
Returns:
list[SampleRequest]: A list of sample requests generated from the
dataset.
"""
raise NotImplementedError("sample must be implemented in subclasses.")
def maybe_oversample_requests(self, requests: list[SampleRequest],
num_requests: int) -> None:
"""
Oversamples the list of requests if its size is less than the desired
number.
Args:
requests (List[SampleRequest]): The current list of sampled
requests. num_requests (int): The target number of requests.
"""
if len(requests) < num_requests:
random.seed(self.random_seed)
additional = random.choices(requests,
k=num_requests - len(requests))
requests.extend(additional)
logger.info("Oversampled requests to reach %d total samples.",
num_requests)
def is_valid_sequence(
prompt_len: int,
output_len: int,
min_len: int = 4,
max_prompt_len: int = 1024,
max_total_len: int = 2048,
skip_min_output_len_check: bool = False,
) -> bool:
"""
Validate a sequence based on prompt and output lengths.
Default pruning criteria are copied from the original `sample_hf_requests`
and `sample_sharegpt_requests` functions in benchmark_serving.py, as well as
from `sample_requests` in benchmark_throughput.py.
"""
# Check for invalid conditions
prompt_too_short = prompt_len < min_len
output_too_short = (not skip_min_output_len_check) and (output_len
< min_len)
prompt_too_long = prompt_len > max_prompt_len
combined_too_long = (prompt_len + output_len) > max_total_len
# Return True if none of the invalid conditions are met
return not (prompt_too_short or output_too_short or prompt_too_long
or combined_too_long)
def process_image(image: Any) -> Mapping[str, Any]:
"""
Process a single image input and return a multimedia content dictionary.
Supports three input types:
1. Dictionary with raw image bytes: - Expects a dict with a 'bytes' key
containing raw image data. - Loads the bytes as a PIL.Image.Image.
2. PIL.Image.Image input: - Converts the image to RGB. - Saves the image as
a JPEG in memory. - Encodes the JPEG data as a base64 string. - Returns
a dictionary with the image as a base64 data URL.
3. String input: - Treats the string as a URL or local file path. -
Prepends "file://" if the string doesn't start with "http://" or
"file://". - Returns a dictionary with the image URL.
Raises:
ValueError: If the input is not a supported type.
"""
if isinstance(image, dict) and 'bytes' in image:
image = Image.open(BytesIO(image['bytes']))
if isinstance(image, Image.Image):
image = image.convert("RGB")
with io.BytesIO() as image_data:
image.save(image_data, format="JPEG")
image_base64 = base64.b64encode(
image_data.getvalue()).decode("utf-8")
return {
"type": "image_url",
"image_url": {
"url": f"data:image/jpeg;base64,{image_base64}"
},
}
if isinstance(image, str):
image_url = (image if image.startswith(
("http://", "file://")) else f"file://{image}")
return {"type": "image_url", "image_url": {"url": image_url}}
raise ValueError(f"Invalid image input {image}. Must be a PIL.Image.Image"
" or str or dictionary with raw image bytes.")
class EBDataset(BenchmarkDataset):
"""
Implements the ShareGPT dataset. Loads data from a JSON file and generates
sample requests based on conversation turns.
"""
temperature: float
repetition_penalty: float
frequency_penalty: float
presence_penalty: float
top_p: float
prompt_len: int
def __init__(self, **kwargs) -> None:
super().__init__(**kwargs)
self.load_data()
def load_data(self) -> None:
if self.dataset_path is None:
raise ValueError("dataset_path must be provided for loading data.")
with open(self.dataset_path, encoding="utf-8") as f:
self.data = [json.loads(i.strip()) for i in f.readlines()]
def sample(
self,
num_requests: int,
lora_path: Optional[str] = None,
max_loras: Optional[int] = None,
output_len: Optional[int] = None,
enable_multimodal_chat: bool = False,
**kwargs,
) -> list:
samples: list = []
for entry in self.data:
if len(samples) >= num_requests:
break
prompt = entry["text"]
self.temperature = float(entry["temperature"])
self.repetition_penalty = float(entry["penalty_score"])
self.frequency_penalty = float(entry["frequency_score"])
self.presence_penalty = float(entry["presence_score"])
self.top_p = float(entry["topp"])
self.prompt_len = int(entry["input_token_num"])
new_output_len = int(entry["max_dec_len"])
if enable_multimodal_chat:
prompt = self.apply_multimodal_chat_transformation(
prompt, None)
samples.append(
SampleRequest(
prompt=prompt,
prompt_len=self.prompt_len,
history_QA=[],
expected_output_len=new_output_len,
))
self.maybe_oversample_requests(samples, num_requests)
return samples
class EBChatDataset(BenchmarkDataset):
"""
Implements the ShareGPT dataset. Loads data from a JSON file and generates
sample requests based on conversation turns.
"""
prompt_len: int
def __init__(self, **kwargs) -> None:
super().__init__(**kwargs)
self.load_data()
def load_data(self) -> None:
if self.dataset_path is None:
raise ValueError("dataset_path must be provided for loading data.")
with open(self.dataset_path, encoding="utf-8") as f:
self.data = [json.loads(i.strip()) for i in f.readlines()]
def sample(
self,
num_requests: int,
lora_path: Optional[str] = None,
max_loras: Optional[int] = None,
output_len: Optional[int] = None,
enable_multimodal_chat: bool = False,
**kwargs,
) -> list:
samples: list = []
for entry in self.data:
if len(samples) >= num_requests:
break
json_data = entry
prompt = entry["messages"][-1].get("content", "")
history_QA = entry.get("messages", [])
new_output_len = int(entry.get("max_tokens", 12288))
if enable_multimodal_chat:
prompt = self.apply_multimodal_chat_transformation(
prompt, None)
samples.append(
SampleRequest(
json_data=json_data,
prompt=prompt,
prompt_len=0,
history_QA=history_QA,
expected_output_len=new_output_len,
))
self.maybe_oversample_requests(samples, num_requests)
return samples

File diff suppressed because it is too large

View File

@@ -0,0 +1,90 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
# This file is modified from https://github.com/vllm-project/vllm/blob/main/benchmarks/benchmark_utils.py
import argparse
import json
import math
import os
from typing import Any
def convert_to_pytorch_benchmark_format(args: argparse.Namespace,
metrics: dict[str, list],
extra_info: dict[str, Any]) -> list:
"""
Save the benchmark results in the format used by PyTorch OSS benchmark with
one metric per record
https://github.com/pytorch/pytorch/wiki/How-to-integrate-with-PyTorch-OSS-benchmark-database
"""
records = []
if not os.environ.get("SAVE_TO_PYTORCH_BENCHMARK_FORMAT", False):
return records
for name, benchmark_values in metrics.items():
record = {
"benchmark": {
"name": "vLLM benchmark",
"extra_info": {
"args": vars(args),
},
},
"model": {
"name": args.model,
},
"metric": {
"name": name,
"benchmark_values": benchmark_values,
"extra_info": extra_info,
},
}
tp = record["benchmark"]["extra_info"]["args"].get(
"tensor_parallel_size")
# Save tensor_parallel_size parameter if it's part of the metadata
if not tp and "tensor_parallel_size" in extra_info:
record["benchmark"]["extra_info"]["args"][
"tensor_parallel_size"] = extra_info["tensor_parallel_size"]
records.append(record)
return records
class InfEncoder(json.JSONEncoder):
"""InfEncoder"""
def clear_inf(self, o: Any):
"""clear_inf"""
if isinstance(o, dict):
return {k: self.clear_inf(v) for k, v in o.items()}
elif isinstance(o, list):
return [self.clear_inf(v) for v in o]
elif isinstance(o, float) and math.isinf(o):
return "inf"
return o
def iterencode(self, o: Any, *args, **kwargs) -> Any:
"""iterencode"""
return super().iterencode(self.clear_inf(o), *args, **kwargs)
def write_to_json(filename: str, records: list) -> None:
"""write_to_json"""
with open(filename, "w") as f:
json.dump(records, f, cls=InfEncoder)

View File

@@ -0,0 +1,5 @@
aiohttp
tqdm
numpy
Pillow
pyyaml

View File

@@ -0,0 +1,8 @@
enable_chunked_prefill: True
max_model_len: 131072
max_num_seqs: 16
kv_cache_ratio: 0.75
tensor_parallel_size: 8
max_num_batched_tokens: 4096
max_num_partial_prefills: 3
max_long_partial_prefills: 3

View File

@@ -0,0 +1,5 @@
max_model_len: 131072
max_num_seqs: 40
gpu_memory_utilization: 0.9
tensor_parallel_size: 8
quantization: wint4

View File

@@ -0,0 +1,8 @@
enable_chunked_prefill: True
max_model_len: 131072
max_num_seqs: 16
kv_cache_ratio: 0.75
tensor_parallel_size: 8
max_num_batched_tokens: 4096
max_num_partial_prefills: 3
max_long_partial_prefills: 3

View File

@@ -0,0 +1,10 @@
enable_mm: True
max_model_len: 32768
max_num_seqs: 128
gpu_memory_utilization: 0.9
kv_cache_ratio: 0.71
tensor_parallel_size: 1
enable_chunked_prefill: True
max_num_batched_tokens: 384
quantization: wint4
reasoning_parser: ernie-45-vl

View File

@@ -0,0 +1,5 @@
max_model_len: 32768
max_num_seqs: 128
kv_cache_ratio: 0.75
tensor_parallel_size: 1
max_num_batched_tokens: 32768

View File

@@ -0,0 +1,5 @@
max_model_len: 32768
max_num_seqs: 32
kv_cache_ratio: 0.5
tensor_parallel_size: 1
quantization: wint4

View File

@@ -0,0 +1,6 @@
max_model_len: 32768
max_num_seqs: 128
kv_cache_ratio: 0.75
tensor_parallel_size: 1
max_num_batched_tokens: 32768
quantization: wint4

View File

@@ -0,0 +1,6 @@
max_model_len: 32768
max_num_seqs: 128
kv_cache_ratio: 0.75
tensor_parallel_size: 1
max_num_batched_tokens: 32768
quantization: wint8

View File

@@ -0,0 +1,5 @@
max_model_len: 32768
max_num_seqs: 128
kv_cache_ratio: 0.75
tensor_parallel_size: 1
max_num_batched_tokens: 32768

View File

@@ -0,0 +1,12 @@
max_model_len: 32768
max_num_seqs: 256
tensor_parallel_size: 8
quantization: block_wise_fp8
gpu_memory_utilization: 0.9
kv_cache_ratio: 0.8
enable_chunked_prefill: True
max_num_batched_tokens: 1024
max_num_partial_prefills: 3
max_long_partial_prefills: 3
enable_prefix_caching: True
swap_space: 200

View File

@@ -0,0 +1,11 @@
max_model_len: 32768
max_num_seqs: 256
tensor_parallel_size: 8
gpu_memory_utilization: 0.9
kv_cache_ratio: 0.8
enable_chunked_prefill: True
max_num_batched_tokens: 1024
max_num_partial_prefills: 3
max_long_partial_prefills: 3
enable_prefix_caching: True
swap_space: 200

View File

@@ -0,0 +1,5 @@
max_model_len: 32768
max_num_seqs: 96
gpu_memory_utilization: 0.9
kv_cache_ratio: 0.71
tensor_parallel_size: 4

View File

@@ -0,0 +1,15 @@
max_model_len: 32768
max_num_seqs: 256
gpu_memory_utilization: 0.9
kv_cache_ratio: 0.8
tensor_parallel_size: 4
cache_queue_port: 55663
enable_chunked_prefill: True
splitwise_role: decode
engine_worker_queue_port: 6678
cache_transfer_protocol: "rdma,ipc"
rdma_comm_ports: "7671,7672,7673,7674"
pd_comm_port: "2334"
max_num_batched_tokens: 384
max_num_partial_prefills: 3
max_long_partial_prefills: 3

View File

@@ -0,0 +1,12 @@
max_model_len: 32768
max_num_seqs: 16
gpu_memory_utilization: 0.9
kv_cache_ratio: 0.9
tensor_parallel_size: 4
splitwise_role: prefill
enable_prefix_caching: True
cache_queue_port: 55664
engine_worker_queue_port: 6677
cache_transfer_protocol: "rdma,ipc"
rdma_comm_ports: "7675,7676,7677,7678"
pd_comm_port: "2333"

View File

@@ -0,0 +1,6 @@
max_model_len: 32768
max_num_seqs: 128
kv_cache_ratio: 0.75
tensor_parallel_size: 1
enable_prefix_caching: true
enable_chunked_prefill: true

View File

@@ -0,0 +1,5 @@
max_model_len: 32768
max_num_seqs: 96
gpu_memory_utilization: 0.9
kv_cache_ratio: 0.71
tensor_parallel_size: 4

View File

@@ -0,0 +1,13 @@
max_model_len: 32768
max_num_seqs: 256
gpu_memory_utilization: 0.9
kv_cache_ratio: 0.8
tensor_parallel_size: 1
data_parallel_size: 8
num_gpu_blocks_override: 1024
cache_queue_port: 55663
splitwise_role: decode
engine_worker_queue_port: 6678
cache_transfer_protocol: "rdma"
rdma_comm_ports: "7671,7672,7673,7674,7675,7676,7677,7678"
pd_comm_port: "2334"

View File

@@ -0,0 +1,13 @@
max_model_len: 32768
max_num_seqs: 16
gpu_memory_utilization: 0.9
kv_cache_ratio: 0.9
tensor_parallel_size: 1
data_parallel_size: 8
splitwise_role: prefill
cache_queue_port: 55664
engine_worker_queue_port: 6677
num_gpu_blocks_override: 1024
cache_transfer_protocol: "rdma"
rdma_comm_ports: "7671,7672,7673,7674,7675,7676,7677,7678"
pd_comm_port: "2334"

View File

@@ -0,0 +1,6 @@
max_model_len: 32768
max_num_seqs: 96
gpu_memory_utilization: 0.9
kv_cache_ratio: 0.71
tensor_parallel_size: 4
quantization: wint4

View File

@@ -0,0 +1,13 @@
max_model_len: 32768
max_num_seqs: 128
gpu_memory_utilization: 0.9
kv_cache_ratio: 0.7
tensor_parallel_size: 4
cache_queue_port: 55663
enable_chunked_prefill: False
enable_prefix_caching: False
splitwise_role: decode
engine_worker_queue_port: 6678
cache_transfer_protocol: "rdma,ipc"
rdma_comm_ports: "7671,7672,7673,7674"
pd_comm_port: "2334"

View File

@@ -0,0 +1,12 @@
max_model_len: 32768
max_num_seqs: 16
gpu_memory_utilization: 0.9
kv_cache_ratio: 0.9
tensor_parallel_size: 4
splitwise_role: prefill
enable_prefix_caching: False
cache_queue_port: 55664
engine_worker_queue_port: 6677
cache_transfer_protocol: "rdma,ipc"
rdma_comm_ports: "7675,7676,7677,7678"
pd_comm_port: "2333"

View File

@@ -0,0 +1,5 @@
max_model_len: 32768
max_num_seqs: 40
tensor_parallel_size: 4
quantization: wint4
gpu_memory_utilization: 0.9

View File

@@ -0,0 +1,5 @@
max_model_len: 32768
max_num_seqs: 160
tensor_parallel_size: 8
quantization: wint4
gpu_memory_utilization: 0.9

View File

@@ -0,0 +1,8 @@
enable_prefix_caching: True
max_model_len: 32768
max_num_seqs: 128
gpu_memory_utilization: 0.9
kv_cache_ratio: 0.71
tensor_parallel_size: 4
swap_space: 200
cache_queue_port: 55664

View File

@@ -0,0 +1,15 @@
max_model_len: 32768
max_num_seqs: 256
gpu_memory_utilization: 0.9
kv_cache_ratio: 0.8
tensor_parallel_size: 4
cache_queue_port: 55663
enable_chunked_prefill: True
splitwise_role: decode
engine_worker_queue_port: 6678
cache_transfer_protocol: "rdma,ipc"
rdma_comm_ports: "7671,7672,7673,7674"
pd_comm_port: "2334"
max_num_batched_tokens: 384
max_num_partial_prefills: 3
max_long_partial_prefills: 3

View File

@@ -0,0 +1,12 @@
max_model_len: 32768
max_num_seqs: 16
gpu_memory_utilization: 0.9
kv_cache_ratio: 0.9
tensor_parallel_size: 4
splitwise_role: prefill
enable_prefix_caching: True
cache_queue_port: 55664
engine_worker_queue_port: 6677
cache_transfer_protocol: "rdma,ipc"
rdma_comm_ports: "7675,7676,7677,7678"
pd_comm_port: "2333"

View File

@@ -0,0 +1,5 @@
max_model_len: 32768
max_num_seqs: 96
gpu_memory_utilization: 0.9
kv_cache_ratio: 0.71
tensor_parallel_size: 8

View File

@@ -0,0 +1,5 @@
max_model_len: 32768
max_num_seqs: 80
tensor_parallel_size: 8
quantization: wint8
gpu_memory_utilization: 0.9

View File

@@ -0,0 +1,9 @@
enable_prefix_caching: True
max_model_len: 32768
max_num_batched_tokens: 68304
max_num_seqs: 128
gpu_memory_utilization: 0.9
kv_cache_ratio: 0.71
tensor_parallel_size: 8
swap_space: 100
cache_queue_port: 55664

View File

@@ -0,0 +1,9 @@
enable_mm: True
max_model_len: 32768
max_num_seqs: 56
gpu_memory_utilization: 0.9
kv_cache_ratio: 0.8
tensor_parallel_size: 8
quantization: wint4
limit_mm_per_prompt: '{"image": 100, "video": 100}'
reasoning_parser: ernie-45-vl

View File

@@ -0,0 +1,11 @@
enable_mm: True
max_model_len: 32768
max_num_seqs: 56
gpu_memory_utilization: 0.8
kv_cache_ratio: 0.8
tensor_parallel_size: 8
quantization: wint4
limit_mm_per_prompt: '{"image": 100, "video": 100}'
enable_chunked_prefill: True
max_num_batched_tokens: 384
reasoning_parser: ernie-45-vl

View File

@@ -0,0 +1,9 @@
enable_mm: True
max_model_len: 32768
max_num_seqs: 36
gpu_memory_utilization: 0.9
kv_cache_ratio: 0.8
tensor_parallel_size: 4
quantization: wint4
limit_mm_per_prompt: '{"image": 100, "video": 100}'
reasoning_parser: ernie-45-vl

View File

@@ -0,0 +1,9 @@
enable_mm: True
max_model_len: 32768
max_num_seqs: 36
gpu_memory_utilization: 0.95
kv_cache_ratio: 0.8
tensor_parallel_size: 8
quantization: wint8
limit_mm_per_prompt: '{"image": 100, "video": 100}'
reasoning_parser: ernie-45-vl

View File

@@ -0,0 +1,11 @@
enable_mm: True
max_model_len: 32768
max_num_seqs: 36
gpu_memory_utilization: 0.8
kv_cache_ratio: 0.8
tensor_parallel_size: 8
quantization: wint8
limit_mm_per_prompt: '{"image": 100, "video": 100}'
enable_chunked_prefill: True
max_num_batched_tokens: 384
reasoning_parser: ernie-45-vl

View File

@@ -0,0 +1,9 @@
enable_mm: True
max_model_len: 32768
max_num_seqs: 36
gpu_memory_utilization: 0.9
kv_cache_ratio: 0.8
tensor_parallel_size: 4
quantization: wint8
limit_mm_per_prompt: '{"image": 100, "video": 100}'
reasoning_parser: ernie-45-vl

View File

@@ -0,0 +1,5 @@
max_model_len: 32768
max_num_seqs: 128
kv_cache_ratio: 0.75
tensor_parallel_size: 1
enable_static_graph_inference: True

View File

@@ -0,0 +1,5 @@
max_model_len: 32768
max_num_seqs: 128
kv_cache_ratio: 0.75
tensor_parallel_size: 1
enable_static_graph_inference: True

View File

@@ -0,0 +1,6 @@
max_model_len: 32768
max_num_seqs: 128
kv_cache_ratio: 0.75
tensor_parallel_size: 1
quantization: wint8
enable_static_graph_inference: True

View File

@@ -0,0 +1,6 @@
max_model_len: 32768
max_num_seqs: 128
kv_cache_ratio: 0.75
tensor_parallel_size: 1
quantization: wint8
enable_static_graph_inference: True

View File

@@ -0,0 +1,5 @@
max_model_len: 32768
max_num_seqs: 128
kv_cache_ratio: 0.75
tensor_parallel_size: 1
enable_static_graph_inference: True

View File

@@ -0,0 +1,6 @@
max_model_len: 32768
max_num_seqs: 128
kv_cache_ratio: 0.75
tensor_parallel_size: 1
quantization: wint4
enable_static_graph_inference: True

View File

@@ -0,0 +1,6 @@
max_model_len: 32768
max_num_seqs: 96
gpu_memory_utilization: 0.9
kv_cache_ratio: 0.71
tensor_parallel_size: 4
enable_static_graph_inference: True

View File

@@ -0,0 +1,5 @@
max_model_len: 32768
max_num_seqs: 128
kv_cache_ratio: 0.75
tensor_parallel_size: 1
enable_static_graph_inference: True

View File

@@ -0,0 +1,5 @@
max_model_len: 32768
max_num_seqs: 128
kv_cache_ratio: 0.75
tensor_parallel_size: 1
enable_static_graph_inference: True

View File

@@ -0,0 +1,4 @@
max_model_len: 32768
max_num_seqs: 128
kv_cache_ratio: 0.75
tensor_parallel_size: 1

View File

@@ -0,0 +1,6 @@
max_model_len: 32768
max_num_seqs: 128
kv_cache_ratio: 0.75
tensor_parallel_size: 1
quantization: wfp8afp8
enable_static_graph_inference: True

View File

@@ -0,0 +1,5 @@
max_model_len: 32768
max_num_seqs: 128
kv_cache_ratio: 0.75
tensor_parallel_size: 1
quantization: wfp8afp8

View File

@@ -0,0 +1,5 @@
max_model_len: 32768
max_num_seqs: 128
kv_cache_ratio: 0.75
tensor_parallel_size: 1
quantization: wint8

View File

@@ -0,0 +1,5 @@
max_model_len: 32768
max_num_seqs: 128
kv_cache_ratio: 0.75
tensor_parallel_size: 1
enable_static_graph_inference: True

View File

@@ -0,0 +1,5 @@
max_model_len: 32768
max_num_seqs: 128
kv_cache_ratio: 0.75
tensor_parallel_size: 1
enable_static_graph_inference: True

View File

@@ -0,0 +1,6 @@
max_model_len: 32768
max_num_seqs: 128
kv_cache_ratio: 0.75
tensor_parallel_size: 1
quantization: wint8
enable_static_graph_inference: True

View File

@@ -0,0 +1,6 @@
max_model_len: 32768
max_num_seqs: 128
kv_cache_ratio: 0.75
tensor_parallel_size: 1
quantization: wint8
enable_static_graph_inference: True

View File

@@ -0,0 +1,5 @@
max_model_len: 32768
max_num_seqs: 128
kv_cache_ratio: 0.75
tensor_parallel_size: 1
enable_static_graph_inference: True

View File

@@ -0,0 +1,6 @@
max_model_len: 32768
max_num_seqs: 128
kv_cache_ratio: 0.75
tensor_parallel_size: 1
quantization: wint4
enable_static_graph_inference: True

View File

@@ -0,0 +1,5 @@
max_model_len: 32768
max_num_seqs: 256
gpu_memory_utilization: 0.9
kv_cache_ratio: 0.75
tensor_parallel_size: 1

View File

@@ -0,0 +1,5 @@
max_model_len: 32768
max_num_seqs: 256
gpu_memory_utilization: 0.9
kv_cache_ratio: 0.75
tensor_parallel_size: 1

View File

@@ -0,0 +1,5 @@
max_model_len: 32768
max_num_seqs: 256
gpu_memory_utilization: 0.9
kv_cache_ratio: 0.75
tensor_parallel_size: 1

View File

@@ -0,0 +1,6 @@
max_model_len: 32768
max_num_seqs: 256
gpu_memory_utilization: 0.9
kv_cache_ratio: 0.75
quantization: wint8
tensor_parallel_size: 1

View File

@@ -0,0 +1,6 @@
max_model_len: 32768
max_num_seqs: 256
gpu_memory_utilization: 0.9
kv_cache_ratio: 0.75
quantization: wint8
tensor_parallel_size: 1

View File

@@ -0,0 +1,6 @@
max_model_len: 32768
max_num_seqs: 256
gpu_memory_utilization: 0.9
kv_cache_ratio: 0.75
quantization: wint8
tensor_parallel_size: 1

View File

@@ -0,0 +1,6 @@
max_model_len: 32768
max_num_seqs: 75
gpu_memory_utilization: 0.85
kv_cache_ratio: 0.75
quantization: wint4
tensor_parallel_size: 4

View File

@@ -0,0 +1,6 @@
max_model_len: 32768
max_num_seqs: 25
gpu_memory_utilization: 0.9
kv_cache_ratio: 0.75
quantization: wint8
tensor_parallel_size: 4

View File

@@ -0,0 +1,5 @@
max_model_len: 32768
max_num_seqs: 50
gpu_memory_utilization: 0.85
kv_cache_ratio: 0.75
tensor_parallel_size: 1

View File

@@ -0,0 +1,5 @@
max_model_len: 32768
max_num_seqs: 50
gpu_memory_utilization: 0.85
kv_cache_ratio: 0.75
tensor_parallel_size: 1

View File

@@ -0,0 +1,6 @@
max_model_len: 32768
max_num_seqs: 50
gpu_memory_utilization: 0.8
kv_cache_ratio: 0.75
quantization: wint4
tensor_parallel_size: 1

View File

@@ -0,0 +1,6 @@
max_model_len: 32768
max_num_seqs: 50
gpu_memory_utilization: 0.8
kv_cache_ratio: 0.75
quantization: wint4
tensor_parallel_size: 1

View File

@@ -0,0 +1,8 @@
top_p: 0.8
temperature: 0.8
metadata:
min_tokens: 1
max_tokens: 131071
repetition_penalty: 1.0
frequency_penalty: 0
presence_penalty: 0

View File

@@ -0,0 +1,8 @@
top_p: 0.8
temperature: 0.8
metadata:
min_tokens: 1
max_tokens: 12288
repetition_penalty: 1.0
frequency_penalty: 0
presence_penalty: 0

View File

@@ -0,0 +1,8 @@
top_p: 0.8
temperature: 0.7
metadata:
min_tokens: 1
max_tokens: 12288
repetition_penalty: 1.05
frequency_penalty: 0
presence_penalty: 0

View File

@@ -0,0 +1,8 @@
top_p: 0.8
temperature: 0.7
metadata:
min_tokens: 1
max_tokens: 12288
repetition_penalty: 1.0
frequency_penalty: 0
presence_penalty: 1.5

View File

@@ -0,0 +1,8 @@
top_p: 0.95
temperature: 0.6
metadata:
min_tokens: 1
max_tokens: 32767
repetition_penalty: 1.0
frequency_penalty: 0
presence_penalty: 0

View File

@@ -0,0 +1,6 @@
tensor_parallel_size: 8
max_model_len: 32768
max_num_seqs: 32
num_gpu_blocks_override: 4096
kv_cache_ratio: 0.5
reasoning_parser: ernie-x1

View File

@@ -0,0 +1,6 @@
max_model_len: 32768
max_num_seqs: 32
gpu_memory_utilization: 0.9
tensor_parallel_size: 4
quantization: wint4
reasoning_parser: ernie-x1


@@ -0,0 +1,6 @@
max_model_len: 32768
max_num_seqs: 128
gpu_memory_utilization: 0.9
tensor_parallel_size: 8
quantization: wint4
reasoning_parser: ernie-x1


@@ -0,0 +1,10 @@
enable_prefix_caching: True
num_gpu_blocks_override: 8000
max_model_len: 32768
max_num_seqs: 64
gpu_memory_utilization: 0.85
kv_cache_ratio: 0.5
tensor_parallel_size: 8
swap_space: 200
cache_queue_port: 55664
reasoning_parser: ernie-x1


@@ -0,0 +1,6 @@
tensor_parallel_size: 8
max_model_len: 32768
max_num_seqs: 32
num_gpu_blocks_override: 4096
kv_cache_ratio: 0.5
reasoning_parser: ernie-x1


@@ -0,0 +1,6 @@
max_model_len: 32768
max_num_seqs: 8
gpu_memory_utilization: 0.9
tensor_parallel_size: 4
quantization: wint8
reasoning_parser: ernie-x1


@@ -0,0 +1,6 @@
max_model_len: 32768
max_num_seqs: 64
gpu_memory_utilization: 0.9
tensor_parallel_size: 8
quantization: wint8
reasoning_parser: ernie-x1


@@ -0,0 +1,10 @@
enable_prefix_caching: True
num_gpu_blocks_override: 8000
max_model_len: 32768
max_num_seqs: 64
gpu_memory_utilization: 0.85
kv_cache_ratio: 0.5
tensor_parallel_size: 8
swap_space: 200
cache_queue_port: 55664
reasoning_parser: ernie-x1
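
These templates target the ernie-x1 reasoning parser and, in two of them, turn on prefix caching with an explicit block override, swap space, and a cache-queue port. A small sketch that pulls out just the caching-related keys (the swap_space unit is not stated in the diff and is assumed here to be GB):

```python
CACHE_KEYS = ("enable_prefix_caching", "num_gpu_blocks_override",
              "swap_space", "cache_queue_port")

def caching_options(cfg: dict) -> dict:
    # Keep only the prefix-caching knobs introduced by the templates above.
    return {k: cfg[k] for k in CACHE_KEYS if k in cfg}

print(caching_options({
    "enable_prefix_caching": True,
    "num_gpu_blocks_override": 8000,
    "swap_space": 200,          # unit assumed to be GB; the diff does not say
    "cache_queue_port": 55664,
    "reasoning_parser": "ernie-x1",
}))
```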


@@ -17,8 +17,9 @@
BUILD_WHEEL=${1:-1}
PYTHON_VERSION=${2:-"python"}
export python=$PYTHON_VERSION
CPU_USE_BF16=${3:-"false"}
BUILDING_ARCS=${4:-""}
FD_CPU_USE_BF16=${3:-"false"}
FD_BUILDING_ARCS=${4:-""}
# paddle distributed use to set archs
unset PADDLE_CUDA_ARCH_LIST
@@ -30,13 +31,9 @@ EGG_DIR="fastdeploy.egg-info"
# custom_ops directory config
OPS_SRC_DIR="custom_ops"
OPS_BUILD_DIR="build"
OPS_EGG_DIR="efficitentllm_ops.egg-info"
OPS_TMP_DIR_BASE="tmp_base"
OPS_TMP_DIR="tmp"
TEST_DIR="tests"
# command line log config
RED='\033[0;31m'
BLUE='\033[0;34m'
@@ -44,13 +41,14 @@ GREEN='\033[1;32m'
BOLD='\033[1m'
NONE='\033[0m'
DEVICE_TYPE="gpu"
function python_version_check() {
PY_MAIN_VERSION=`${python} -V 2>&1 | awk '{print $2}' | awk -F '.' '{print $1}'`
PY_SUB_VERSION=`${python} -V 2>&1 | awk '{print $2}' | awk -F '.' '{print $2}'`
echo -e "find python version ${PY_MAIN_VERSION}.${PY_SUB_VERSION}"
if [ $PY_MAIN_VERSION -ne "3" -o $PY_SUB_VERSION -lt "8" ]; then
echo -e "${RED}FAIL:${NONE} please use Python >= 3.8"
if [ $PY_MAIN_VERSION -ne "3" -o $PY_SUB_VERSION -lt "9" ]; then
echo -e "${RED}FAIL:${NONE} please use Python >= 3.9"
exit 1
fi
}
@@ -75,6 +73,7 @@ function copy_ops(){
WHEEL_CPU_NAME="fastdeploy_cpu_ops-${OPS_VERSION}-${PY_VERSION}-${SYSTEM_VERSION}-${PROCESSOR_VERSION}.egg"
is_rocm=`$python -c "import paddle; print(paddle.is_compiled_with_rocm())"`
if [ "$is_rocm" = "True" ]; then
DEVICE_TYPE="rocm"
cp -r ./${OPS_TMP_DIR}/${WHEEL_NAME}/* ../fastdeploy/model_executor/ops/gpu
echo -e "ROCM ops have been copy to fastdeploy"
return
@@ -82,6 +81,7 @@ function copy_ops(){
mkdir -p ../fastdeploy/model_executor/ops/base
is_cuda=`$python -c "import paddle; print(paddle.is_compiled_with_cuda())"`
if [ "$is_cuda" = "True" ]; then
DEVICE_TYPE="gpu"
cp -r ./${OPS_TMP_DIR_BASE}/${WHEEL_BASE_NAME}/* ../fastdeploy/model_executor/ops/base
cp -r ./${OPS_TMP_DIR}/${WHEEL_NAME}/* ../fastdeploy/model_executor/ops/gpu
echo -e "BASE and CUDA ops have been copy to fastdeploy"
@@ -90,6 +90,7 @@ function copy_ops(){
is_xpu=`$python -c "import paddle; print(paddle.is_compiled_with_xpu())"`
if [ "$is_xpu" = "True" ]; then
DEVICE_TYPE="xpu"
cp -r ./${OPS_TMP_DIR}/${WHEEL_NAME}/* ../fastdeploy/model_executor/ops/xpu
echo -e "xpu ops have been copy to fastdeploy"
return
@@ -97,20 +98,14 @@ function copy_ops(){
is_npu=`$python -c "import paddle; print(paddle.is_compiled_with_custom_device('npu'))"`
if [ "$is_npu" = "True" ]; then
DEVICE_TYPE="npu"
cp -r ${OPS_TMP_DIR}/${WHEEL_NAME}/* ../fastdeploy/model_executor/ops/npu
echo -e "npu ops have been copy to fastdeploy"
return
fi
DEVICE_TYPE="cpu"
cp -r ./${OPS_TMP_DIR_BASE}/${WHEEL_BASE_NAME}/* ../fastdeploy/model_executor/ops/base
cd ${OPS_TMP_DIR}/${WHEEL_CPU_NAME}/xFasterTransformer/build/
for file in *_pd_.so; do
mv "$file" "${file/_pd_/}"
done
cd ../../x86-simd-sort/builddir/
for file in *_pd_.so; do
mv "$file" "${file/_pd_/}"
done
cd ../../../../
cp -r ${OPS_TMP_DIR}/${WHEEL_CPU_NAME}/* ../fastdeploy/model_executor/ops/cpu
echo -e "BASE and CPU ops have been copy to fastdeploy"
@@ -122,15 +117,30 @@ function build_and_install_ops() {
export no_proxy=bcebos.com,paddlepaddle.org.cn,${no_proxy}
echo -e "${BLUE}[build]${NONE} build and install fastdeploy_base_ops..."
${python} setup_ops_base.py install --install-lib ${OPS_TMP_DIR_BASE}
find ${OPS_TMP_DIR_BASE} -type f -name "*.o" -exec rm -f {} \;
echo -e "${BLUE}[build]${NONE} build and install fastdeploy_ops..."
if [ "$CPU_USE_BF16" == "true" ]; then
CPU_USE_BF16=True ${python} setup_ops.py install --install-lib ${OPS_TMP_DIR}
:
elif [ "$CPU_USE_BF16" == "false" ]; then
TMP_DIR_REAL_PATH=`readlink -f ${OPS_TMP_DIR}`
is_xpu=`$python -c "import paddle; print(paddle.is_compiled_with_xpu())"`
if [ "$is_xpu" = "True" ]; then
cd xpu_ops/src
bash build.sh ${TMP_DIR_REAL_PATH}
cd ../..
elif [ "$FD_CPU_USE_BF16" == "true" ]; then
if [ "$FD_BUILDING_ARCS" == "" ]; then
FD_CPU_USE_BF16=True ${python} setup_ops.py install --install-lib ${OPS_TMP_DIR}
else
FD_BUILDING_ARCS=${FD_BUILDING_ARCS} FD_CPU_USE_BF16=True ${python} setup_ops.py install --install-lib ${OPS_TMP_DIR}
fi
find ${OPS_TMP_DIR} -type f -name "*.o" -exec rm -f {} \;
elif [ "$FD_CPU_USE_BF16" == "false" ]; then
if [ "$FD_BUILDING_ARCS" == "" ]; then
${python} setup_ops.py install --install-lib ${OPS_TMP_DIR}
:
else
FD_BUILDING_ARCS=${FD_BUILDING_ARCS} ${python} setup_ops.py install --install-lib ${OPS_TMP_DIR}
fi
find ${OPS_TMP_DIR} -type f -name "*.o" -exec rm -f {} \;
else
echo "Error: Invalid parameter '$CPU_USE_BF16'. Please use true or false."
echo "Error: Invalid parameter '$FD_CPU_USE_BF16'. Please use true or false."
exit 1
fi
if [ $? -ne 0 ]; then
@@ -146,11 +156,7 @@ function build_and_install_ops() {
function build_and_install() {
echo -e "${BLUE}[build]${NONE} building fastdeploy wheel..."
if [ "$BUILDING_ARCS" == "" ]; then
${python} setup.py bdist_wheel --python-tag py3
else
BUILDING_ARCS=${BUILDING_ARCS} ${python} setup.py bdist_wheel --python-tag py3
fi
${python} setup.py bdist_wheel --python-tag=py3
if [ $? -ne 0 ]; then
echo -e "${RED}[FAIL]${NONE} build fastdeploy wheel failed"
@@ -174,10 +180,12 @@ function cleanup() {
rm -rf $BUILD_DIR $EGG_DIR
if [ `${python} -m pip list | grep fastdeploy | wc -l` -gt 0 ]; then
echo -e "${BLUE}[init]${NONE} uninstalling fastdeploy..."
${python} -m pip uninstall -y fastdeploy
${python} -m pip uninstall -y fastdeploy-${DEVICE_TYPE}
fi
rm -rf $OPS_SRC_DIR/$BUILD_DIR $OPS_SRC_DIR/$EGG_DIR
rm -rf $OPS_SRC_DIR/$OPS_TMP_DIR_BASE
rm -rf $OPS_SRC_DIR/$OPS_TMP_DIR
}
function abort() {
@@ -187,7 +195,7 @@ function abort() {
cur_dir=`basename "$pwd"`
rm -rf $BUILD_DIR $EGG_DIR $DIST_DIR
${python} -m pip uninstall -y fastdeploy
${python} -m pip uninstall -y fastdeploy-${DEVICE_TYPE}
rm -rf $OPS_SRC_DIR/$BUILD_DIR $OPS_SRC_DIR/$EGG_DIR
}


@@ -0,0 +1,643 @@
From 5112002c155dceecc5e5983cdb67157e4f5400e2 Mon Sep 17 00:00:00 2001
From: minghaipeng <minghaipeng@baidu.com>
Date: Wed, 25 Jun 2025 15:05:24 +0800
Subject: [PATCH] DeepGEMM 95e81b3
---
deep_gemm/__init__.py | 2 +-
deep_gemm/include/deep_gemm/scheduler.cuh | 2 +-
deep_gemm/jit/compiler.py | 2 +-
deep_gemm/jit/interleave_ffma.py | 2 +-
deep_gemm/jit/runtime.py | 4 +-
deep_gemm/jit/template.py | 34 ++++----
deep_gemm/jit_kernels/gemm.py | 44 +++++------
deep_gemm/jit_kernels/m_grouped_gemm.py | 96 +++++++++++------------
deep_gemm/jit_kernels/tuner.py | 10 +--
deep_gemm/jit_kernels/utils.py | 18 +++--
deep_gemm/paddle_utils.py | 20 +++++
deep_gemm/utils.py | 30 +++----
12 files changed, 143 insertions(+), 121 deletions(-)
create mode 100644 deep_gemm/paddle_utils.py
diff --git a/deep_gemm/__init__.py b/deep_gemm/__init__.py
index 15b22ca..63e7fb7 100644
--- a/deep_gemm/__init__.py
+++ b/deep_gemm/__init__.py
@@ -1,4 +1,4 @@
-import torch
+import paddle
from . import jit
from .jit_kernels import (
diff --git a/deep_gemm/include/deep_gemm/scheduler.cuh b/deep_gemm/include/deep_gemm/scheduler.cuh
index 9743871..6c97152 100644
--- a/deep_gemm/include/deep_gemm/scheduler.cuh
+++ b/deep_gemm/include/deep_gemm/scheduler.cuh
@@ -102,7 +102,7 @@ struct Scheduler {
if constexpr (kGemmType == GemmType::Normal) {
return block_idx * block_size;
} else if constexpr (kGemmType == GemmType::GroupedContiguous) {
- auto offset = kIgnoreGroupedForGroupedContiguous ? 0 : __ldg(grouped_layout + m_block_idx * BLOCK_M);
+ auto offset = kIgnoreGroupedForGroupedContiguous ? 0 : max(0, __ldg(grouped_layout + m_block_idx * BLOCK_M));
return offset * shape_dim + block_idx * block_size;
} else if constexpr (kGemmType == GemmType::GroupedMasked) {
return curr_group_idx * shape_dim + block_idx * block_size;
diff --git a/deep_gemm/jit/compiler.py b/deep_gemm/jit/compiler.py
index c17d466..6fdc52f 100644
--- a/deep_gemm/jit/compiler.py
+++ b/deep_gemm/jit/compiler.py
@@ -4,7 +4,7 @@ import os
import re
import subprocess
import uuid
-from torch.utils.cpp_extension import CUDA_HOME
+from ..paddle_utils import CUDA_HOME
from typing import Tuple
from . import interleave_ffma
diff --git a/deep_gemm/jit/interleave_ffma.py b/deep_gemm/jit/interleave_ffma.py
index fcb377e..db9d6f3 100644
--- a/deep_gemm/jit/interleave_ffma.py
+++ b/deep_gemm/jit/interleave_ffma.py
@@ -3,7 +3,7 @@ import mmap
import os
import re
import subprocess
-from torch.utils.cpp_extension import CUDA_HOME
+from ..paddle_utils import CUDA_HOME
def run_cuobjdump(file_path):
diff --git a/deep_gemm/jit/runtime.py b/deep_gemm/jit/runtime.py
index 66c370a..4761426 100644
--- a/deep_gemm/jit/runtime.py
+++ b/deep_gemm/jit/runtime.py
@@ -1,6 +1,6 @@
import ctypes
import os
-import torch
+import paddle
from typing import Optional
from .template import map_ctype
@@ -35,7 +35,7 @@ class Runtime:
assert len(args) == len(self.args), f'Expected {len(self.args)} arguments, got {len(args)}'
cargs = []
for arg, (name, dtype) in zip(args, self.args):
- if isinstance(arg, torch.Tensor):
+ if isinstance(arg, paddle.Tensor):
assert arg.dtype == dtype, f'Expected tensor dtype `{dtype}` for `{name}`, got `{arg.dtype}`'
else:
assert isinstance(arg, dtype), f'Expected built-in type `{dtype}` for `{name}`, got `{type(arg)}`'
diff --git a/deep_gemm/jit/template.py b/deep_gemm/jit/template.py
index ead37f5..51b02c1 100644
--- a/deep_gemm/jit/template.py
+++ b/deep_gemm/jit/template.py
@@ -1,24 +1,24 @@
import copy
import ctypes
import os
-import torch
+import paddle
from typing import Any, Dict, Iterable, Tuple
# Name map for Python `eval`
typename_map: Dict[Any, str] = {
**{t: t.__name__ for t in (bool, int, float)},
- torch.int: 'torch.int',
- torch.float: 'torch.float',
- torch.bfloat16: 'torch.bfloat16',
- torch.float8_e4m3fn: 'torch.float8_e4m3fn',
- torch.cuda.Stream: 'torch.cuda.Stream',
+ paddle.int32: 'paddle.int32',
+ paddle.float32: 'paddle.float32',
+ paddle.bfloat16: 'paddle.bfloat16',
+ paddle.float8_e4m3fn: 'paddle.float8_e4m3fn',
+ paddle.device.cuda.Stream: "paddle.device.cuda.Stream",
}
# `ctype` map for Python casting
ctype_map: Dict[Any, Any] = {
**{t: getattr(ctypes, f'c_{t.__name__}') for t in (bool, int, float)},
- **{t: ctypes.c_void_p for t in (torch.int, torch.float, torch.bfloat16, torch.float8_e4m3fn, torch.cuda.Stream)},
+ **{t: ctypes.c_void_p for t in (paddle.int32, paddle.float32, paddle.bfloat16, paddle.float8_e4m3fn, paddle.device.cuda.Stream)},
}
@@ -27,25 +27,25 @@ genc_map = {
bool: ('bool', 'bool'),
int: ('int', 'int'),
float: ('float', 'float'),
- torch.int: ('void*', 'int*'),
- torch.float: ('void*', 'float*'),
- torch.bfloat16: ('void*', '__nv_bfloat16*'),
- torch.float8_e4m3fn: ('void*', '__nv_fp8_e4m3*'),
- torch.cuda.Stream: ('void*', 'cudaStream_t'),
+ paddle.int32: ('void*', 'int*'),
+ paddle.float32: ('void*', 'float*'),
+ paddle.bfloat16: ('void*', '__nv_bfloat16*'),
+ paddle.float8_e4m3fn: ('void*', '__nv_fp8_e4m3*'),
+ paddle.device.cuda.Stream: ('void*', 'cudaStream_t'),
}
def map_ctype(value: Any) -> Any:
if hasattr(value, 'data_ptr'):
- if value.dtype == torch.int:
+ if value.dtype == paddle.int32:
return ctypes.c_void_p(value.data_ptr())
- elif value.dtype == torch.float:
+ elif value.dtype == paddle.float32:
return ctypes.c_void_p(value.data_ptr())
- elif value.dtype == torch.bfloat16:
+ elif value.dtype == paddle.bfloat16:
return ctypes.c_void_p(value.data_ptr())
- elif value.dtype == torch.float16:
+ elif value.dtype == paddle.float16:
return ctypes.c_void_p(value.data_ptr())
- elif value.dtype == torch.float8_e4m3fn:
+ elif value.dtype == paddle.float8_e4m3fn:
return ctypes.c_void_p(value.data_ptr())
else:
return ctypes.c_void_p(value.data_ptr())
diff --git a/deep_gemm/jit_kernels/gemm.py b/deep_gemm/jit_kernels/gemm.py
index cb438b7..44aa0ed 100644
--- a/deep_gemm/jit_kernels/gemm.py
+++ b/deep_gemm/jit_kernels/gemm.py
@@ -1,5 +1,5 @@
import math
-import torch
+import paddle
from functools import lru_cache
from typing import Tuple
@@ -166,20 +166,20 @@ def get_best_configs(m: int, n: int, k: int, num_groups: int, num_sms: int,
return num_min_sms, best_block_m, best_block_n, best_num_stages, best_tma_multicast_config, best_smem_config
-def gemm_fp8_fp8_bf16_nt(lhs: Tuple[torch.Tensor, torch.Tensor],
- rhs: Tuple[torch.Tensor, torch.Tensor],
- out: torch.Tensor) -> None:
+def gemm_fp8_fp8_bf16_nt(lhs: Tuple[paddle.Tensor, paddle.Tensor],
+ rhs: Tuple[paddle.Tensor, paddle.Tensor],
+ out: paddle.Tensor) -> None:
"""
Do a normal GEMM with FP8 inputs and BF16 output, with 1x128 LHS scaling and 128x128 RHS scaling.
LHS, RHS, RHS scaling factors, and output tensors must be in contiguous format.
RHS and RHS scaling factors are required to be transposed.
The LHS scaling tensor requires TMA-aligned transposed format, if your input does not match the requirement,
- this function will do a transposing with a set of slow PyTorch operations.
+ this function will do a transposing with a set of slow paddle operations.
Arguments:
- lhs: the first element is an FP8 tensor (typed `torch.float8_e4m3fn`) of shape `[m, k]`,
+ lhs: the first element is an FP8 tensor (typed `paddle.float8_e4m3fn`) of shape `[m, k]`,
the second element is an FP32 1x128 scaling tensor for LHS of shape `[m, ⌈k / 128⌉]`.
- rhs: the first element is an FP8 tensor (typed `torch.float8_e4m3fn`) of shape `[n, k]`.
+ rhs: the first element is an FP8 tensor (typed `paddle.float8_e4m3fn`) of shape `[n, k]`.
the second element is an FP32 128x128 scaling tensor for RHS of shape `[⌈n / 128⌉, ⌈k / 128⌉]`.
out: the BF16 output tensor of shape `[m, n]`, representing the result.
"""
@@ -189,22 +189,22 @@ def gemm_fp8_fp8_bf16_nt(lhs: Tuple[torch.Tensor, torch.Tensor],
n, k_ = rhs.shape
m_, n_ = out.shape
- assert n % 64 == 0 and k % 128 == 0
+ # assert n % 64 == 0 and k % 128 == 0
# Type and shape checks
- assert m == m_ and n == n_ and k == k_
- assert n > 0 and k > 0
- assert lhs_scales.shape == (m, (k + 127) // 128)
- assert rhs_scales.shape == ((n + 127) // 128, (k + 127) // 128)
- assert lhs.dtype == torch.float8_e4m3fn and lhs_scales.dtype == torch.float32
- assert rhs.dtype == torch.float8_e4m3fn and rhs_scales.dtype == torch.float32
- assert out.dtype == torch.bfloat16
- assert lhs.is_contiguous() and rhs.is_contiguous() and out.is_contiguous()
+ # assert m == m_ and n == n_ and k == k_
+ # assert n > 0 and k > 0
+ # assert lhs_scales.shape == (m, (k + 127) // 128)
+ # assert rhs_scales.shape == ((n + 127) // 128, (k + 127) // 128)
+ # assert lhs.dtype == paddle.float8_e4m3fn and lhs_scales.dtype == paddle.float32
+ # assert rhs.dtype == paddle.float8_e4m3fn and rhs_scales.dtype == paddle.float32
+ # assert out.dtype == paddle.bfloat16
+ # assert lhs.is_contiguous() and rhs.is_contiguous() and out.is_contiguous()
# LHS scales must be transposed for TMA load, but not for RHS scales
# NOTES: `get_tma_aligned_lhs_scales` may launch a kernel if not processed by previous kernels
lhs_scales = get_col_major_tma_aligned_tensor(lhs_scales)
- assert rhs_scales.is_contiguous()
+ # assert rhs_scales.is_contiguous()
# Do nothing if `m` is zero
if m == 0:
@@ -214,7 +214,7 @@ def gemm_fp8_fp8_bf16_nt(lhs: Tuple[torch.Tensor, torch.Tensor],
global includes, template
num_sms = get_num_sms()
num_sms, block_m, block_n, num_stages, tma_multicast_config, smem_config = get_best_configs(m, n, k, 1, num_sms)
- args = (lhs, lhs_scales, rhs, rhs_scales, out, m, torch.cuda.current_stream(), num_sms, smem_config[0])
+ args = (lhs, lhs_scales, rhs, rhs_scales, out, m, paddle.device.cuda.current_stream(), num_sms, smem_config[0])
runtime = jit_tuner.compile_and_tune(
name='gemm_fp8_fp8_bf16_nt',
keys={'N': n, 'K': k, 'BLOCK_M': block_m, 'BLOCK_N': block_n,
@@ -225,10 +225,10 @@ def gemm_fp8_fp8_bf16_nt(lhs: Tuple[torch.Tensor, torch.Tensor],
'IS_TMA_MULTICAST_ON_A': tma_multicast_config[1]},
space=(),
includes=includes,
- arg_defs=(('lhs', torch.float8_e4m3fn), ('lhs_scales', torch.float),
- ('rhs', torch.float8_e4m3fn), ('rhs_scales', torch.float),
- ('out', torch.bfloat16), ('m', int),
- ('stream', torch.cuda.Stream), ('num_sms', int), ('smem_size', int)),
+ arg_defs=(('lhs', paddle.float8_e4m3fn), ('lhs_scales', paddle.float32),
+ ('rhs', paddle.float8_e4m3fn), ('rhs_scales', paddle.float32),
+ ('out', paddle.bfloat16), ('m', int),
+ ('stream', paddle.device.cuda.Stream), ('num_sms', int), ('smem_size', int)),
template=template,
args=args
)
diff --git a/deep_gemm/jit_kernels/m_grouped_gemm.py b/deep_gemm/jit_kernels/m_grouped_gemm.py
index 3b518c9..ba776bd 100644
--- a/deep_gemm/jit_kernels/m_grouped_gemm.py
+++ b/deep_gemm/jit_kernels/m_grouped_gemm.py
@@ -1,4 +1,4 @@
-import torch
+import paddle
from typing import Tuple
from .gemm import get_best_configs, get_block_n_padding_for_smem_d
@@ -37,25 +37,25 @@ gemm_t::run(out, rhs_scales, grouped_layout,
"""
-def m_grouped_gemm_fp8_fp8_bf16_nt_contiguous(lhs: Tuple[torch.Tensor, torch.Tensor],
- rhs: Tuple[torch.Tensor, torch.Tensor],
- out: torch.Tensor, m_indices: torch.Tensor) -> None:
+def m_grouped_gemm_fp8_fp8_bf16_nt_contiguous(lhs: Tuple[paddle.Tensor, paddle.Tensor],
+ rhs: Tuple[paddle.Tensor, paddle.Tensor],
+ out: paddle.Tensor, m_indices: paddle.Tensor) -> None:
"""
Do a grouped GEMM (contiguous format) with FP8 inputs and BF16 output, with 1x128 LHS scaling and 128x128 RHS scaling.
LHS, RHS, RHS scaling factors, and output tensors must be in contiguous format.
RHS and RHS scaling factors are required to be transposed.
The LHS scaling tensor requires TMA-aligned transposed format, if your input does not match the requirement,
- this function will do a transposing with a set of slow PyTorch operations.
+ this function will do a transposing with a set of slow Pypaddle operations.
On the M axis, inputs are grouped into several batches, of which batch sizes aligned to
`get_m_alignment_for_contiguous_layout()` (128).
Arguments:
- lhs: the first element is an FP8 tensor (typed `torch.float8_e4m3fn`) of shape `[m_sum, k]`,
+ lhs: the first element is an FP8 tensor (typed `paddle.float8_e4m3fn`) of shape `[m_sum, k]`,
the second element is an FP32 1x128 scaling tensor for LHS of shape `[m_sum, ⌈k / 128⌉]`.
- rhs: the first element is an FP8 tensor (typed `torch.float8_e4m3fn`) of shape `[num_groups, n, k]`.
+ rhs: the first element is an FP8 tensor (typed `paddle.float8_e4m3fn`) of shape `[num_groups, n, k]`.
the second element is an FP32 128x128 scaling tensor for RHS of shape `[num_groups, ⌈n / 128⌉, ⌈k / 128⌉]`.
out: the BF16 output tensor of shape `[m_sum, n]`, representing the result.
- m_indices: a tensor of shape `[m_sum]` with type `torch.int`.
+ m_indices: a tensor of shape `[m_sum]` with type `paddle.int`.
`m_indices[i]` records the group which the i-th row of the LHS belong to,
which means that the i-th row of the LHS matrix will be multiplied with `rhs[m_indices[i]]`.
Values of `m_indices` in every-m-alignment-block must also be the same.
@@ -68,19 +68,19 @@ def m_grouped_gemm_fp8_fp8_bf16_nt_contiguous(lhs: Tuple[torch.Tensor, torch.Ten
m__ = m_indices.numel()
# Type and shape checks
- assert m == m_ == m__ and k == k_ and n == n_
- assert lhs_scales.shape == (m, (k + 127) // 128)
- assert rhs_scales.shape == (num_groups, (n + 127) // 128, (k + 127) // 128)
- assert lhs.dtype == torch.float8_e4m3fn and lhs_scales.dtype == torch.float32
- assert rhs.dtype == torch.float8_e4m3fn and rhs_scales.dtype == torch.float32
- assert out.dtype == torch.bfloat16
- assert m_indices.dtype == torch.int32
- assert lhs.is_contiguous() and rhs.is_contiguous()
- assert out.is_contiguous() and m_indices.is_contiguous()
+ # assert m == m_ == m__ and k == k_ and n == n_
+ # assert lhs_scales.shape == (m, (k + 127) // 128)
+ # assert rhs_scales.shape == (num_groups, (n + 127) // 128, (k + 127) // 128)
+ # assert lhs.dtype == paddle.float8_e4m3fn and lhs_scales.dtype == paddle.float32
+ # assert rhs.dtype == paddle.float8_e4m3fn and rhs_scales.dtype == paddle.float32
+ # assert out.dtype == paddle.bfloat16
+ # assert m_indices.dtype == paddle.int32
+ # assert lhs.is_contiguous() and rhs.is_contiguous()
+ # assert out.is_contiguous() and m_indices.is_contiguous()
# LHS scales must be transposed for TMA load, but not for RHS scales
lhs_scales = get_col_major_tma_aligned_tensor(lhs_scales)
- assert rhs_scales.is_contiguous()
+ # assert rhs_scales.is_contiguous()
# Do nothing if `m` is zero
if m == 0:
@@ -92,7 +92,7 @@ def m_grouped_gemm_fp8_fp8_bf16_nt_contiguous(lhs: Tuple[torch.Tensor, torch.Ten
num_sms, block_m, block_n, num_stages, tma_multicast_config, smem_config = get_best_configs(m, n, k, 1, num_sms, is_grouped_contiguous=True)
args = (lhs, lhs_scales, rhs, rhs_scales, out,
m_indices, m, num_groups,
- torch.cuda.current_stream(), num_sms, smem_config[0])
+ paddle.device.cuda.current_stream(), num_sms, smem_config[0])
runtime = jit_tuner.compile_and_tune(
name='m_grouped_gemm_fp8_fp8_bf16_nt',
keys={'N': n, 'K': k, 'BLOCK_M': block_m, 'BLOCK_N': block_n,
@@ -105,11 +105,11 @@ def m_grouped_gemm_fp8_fp8_bf16_nt_contiguous(lhs: Tuple[torch.Tensor, torch.Ten
'GEMM_TYPE': 'GroupedContiguous'},
space=(),
includes=includes,
- arg_defs=(('lhs', torch.float8_e4m3fn), ('lhs_scales', torch.float),
- ('rhs', torch.float8_e4m3fn), ('rhs_scales', torch.float),
- ('out', torch.bfloat16),
- ('grouped_layout', torch.int32), ('m', int), ('num_groups', int),
- ('stream', torch.cuda.Stream), ('num_sms', int), ('smem_size', int)),
+ arg_defs=(('lhs', paddle.float8_e4m3fn), ('lhs_scales', paddle.float32),
+ ('rhs', paddle.float8_e4m3fn), ('rhs_scales', paddle.float32),
+ ('out', paddle.bfloat16),
+ ('grouped_layout', paddle.int32), ('m', int), ('num_groups', int),
+ ('stream', paddle.device.cuda.Stream), ('num_sms', int), ('smem_size', int)),
template=template,
args=args
)
@@ -118,22 +118,22 @@ def m_grouped_gemm_fp8_fp8_bf16_nt_contiguous(lhs: Tuple[torch.Tensor, torch.Ten
runtime(*args)
-def m_grouped_gemm_fp8_fp8_bf16_nt_masked(lhs: Tuple[torch.Tensor, torch.Tensor],
- rhs: Tuple[torch.Tensor, torch.Tensor],
- out: torch.Tensor, masked_m: torch.Tensor, expected_m: int) -> None:
+def m_grouped_gemm_fp8_fp8_bf16_nt_masked(lhs: Tuple[paddle.Tensor, paddle.Tensor],
+ rhs: Tuple[paddle.Tensor, paddle.Tensor],
+ out: paddle.Tensor, masked_m: paddle.Tensor, expected_m: int) -> None:
"""
Do a grouped GEMM (masked format) with FP8 inputs and BF16 output, with 1x128 LHS scaling and 128x128 RHS scaling.
LHS, RHS, RHS scaling factors, and output tensors must be in contiguous format.
RHS and RHS scaling factors are required to be transposed.
The LHS scaling tensor requires TMA-aligned transposed format, if your input does not match the requirement,
- this function will do a transposing with a set of slow PyTorch operations.
+ this function will do a transposing with a set of slow paddle operations.
Moreover, this alignment requirement is different with the contiguous-format kernel, as we require that each batch
should be separately transposed.
Arguments:
- lhs: the first element is an FP8 tensor (typed `torch.float8_e4m3fn`) of shape `[num_groups, m_max, k]`,
+ lhs: the first element is an FP8 tensor (typed `paddle.float8_e4m3fn`) of shape `[num_groups, m_max, k]`,
the second element is an FP32 1x128 scaling tensor for LHS of shape `[num_groups, m_max, ⌈k / 128⌉]`.
- rhs: the first element is an FP8 tensor (typed `torch.float8_e4m3fn`) of shape `[num_groups, n, k]`.
+ rhs: the first element is an FP8 tensor (typed `paddle.float8_e4m3fn`) of shape `[num_groups, n, k]`.
the second element is an FP32 128x128 scaling tensor for RHS of shape `[num_groups, ⌈n / 128⌉, ⌈k / 128⌉]`.
out: the BF16 output tensor of shape `[num_groups, m_max, n]`, representing the result.
masked_m: a tensor of shape `[num_groups]`, `masked_m[i]` records actual rows of the `lhs[i]` matrix to compute
@@ -149,21 +149,21 @@ def m_grouped_gemm_fp8_fp8_bf16_nt_masked(lhs: Tuple[torch.Tensor, torch.Tensor]
num_groups___ = masked_m.numel()
# Type and shape checks
- assert num_groups == num_groups_ == num_groups__ == num_groups___
- assert m == m_ and n == n_ and k == k_
- assert expected_m > 0 and m > 0 and n > 0 and k > 0 and num_groups > 0
- assert lhs_scales.shape == (num_groups, m, (k + 127) // 128)
- assert rhs_scales.shape == (num_groups, (n + 127) // 128, (k + 127) // 128)
- assert lhs.dtype == torch.float8_e4m3fn and lhs_scales.dtype == torch.float32
- assert rhs.dtype == torch.float8_e4m3fn and rhs_scales.dtype == torch.float32
- assert out.dtype == torch.bfloat16
- assert masked_m.dtype == torch.int32
- assert lhs.is_contiguous() and rhs.is_contiguous()
- assert out.is_contiguous() and masked_m.is_contiguous()
+ # assert num_groups == num_groups_ == num_groups__ == num_groups___
+ # assert m == m_ and n == n_ and k == k_
+ # assert expected_m > 0 and m > 0 and n > 0 and k > 0 and num_groups > 0
+ # assert lhs_scales.shape == (num_groups, m, (k + 127) // 128)
+ # assert rhs_scales.shape == (num_groups, (n + 127) // 128, (k + 127) // 128)
+ # assert lhs.dtype == paddle.float8_e4m3fn and lhs_scales.dtype == paddle.float32
+ # assert rhs.dtype == paddle.float8_e4m3fn and rhs_scales.dtype == paddle.float32
+ # assert out.dtype == paddle.bfloat16
+ # assert masked_m.dtype == paddle.int32
+ # assert lhs.is_contiguous() and rhs.is_contiguous()
+ # assert out.is_contiguous() and masked_m.is_contiguous()
# LHS scales must be transposed for TMA load, but not for RHS scales
lhs_scales = get_col_major_tma_aligned_tensor(lhs_scales)
- assert rhs_scales.is_contiguous()
+ # assert rhs_scales.is_contiguous()
# Auto-tuning with compilation
global includes, template
@@ -176,7 +176,7 @@ def m_grouped_gemm_fp8_fp8_bf16_nt_masked(lhs: Tuple[torch.Tensor, torch.Tensor]
args = (lhs, lhs_scales, rhs, rhs_scales, out,
masked_m, m,
- torch.cuda.current_stream(), num_sms, smem_config[0])
+ paddle.device.cuda.current_stream(), num_sms, smem_config[0])
runtime = jit_tuner.compile_and_tune(
name='m_grouped_gemm_fp8_fp8_bf16_nt',
keys={'N': n, 'K': k, 'BLOCK_M': block_m, 'BLOCK_N': block_n,
@@ -189,11 +189,11 @@ def m_grouped_gemm_fp8_fp8_bf16_nt_masked(lhs: Tuple[torch.Tensor, torch.Tensor]
'GEMM_TYPE': 'GroupedMasked'},
space=(),
includes=includes,
- arg_defs=(('lhs', torch.float8_e4m3fn), ('lhs_scales', torch.float),
- ('rhs', torch.float8_e4m3fn), ('rhs_scales', torch.float),
- ('out', torch.bfloat16),
- ('grouped_layout', torch.int32), ('m', int),
- ('stream', torch.cuda.Stream), ('num_sms', int), ('smem_size', int)),
+ arg_defs=(('lhs', paddle.float8_e4m3fn), ('lhs_scales', paddle.float32),
+ ('rhs', paddle.float8_e4m3fn), ('rhs_scales', paddle.float32),
+ ('out', paddle.bfloat16),
+ ('grouped_layout', paddle.int32), ('m', int),
+ ('stream', paddle.device.cuda.Stream), ('num_sms', int), ('smem_size', int)),
template=template,
args=args
)
diff --git a/deep_gemm/jit_kernels/tuner.py b/deep_gemm/jit_kernels/tuner.py
index 6ed6749..9e1d70f 100644
--- a/deep_gemm/jit_kernels/tuner.py
+++ b/deep_gemm/jit_kernels/tuner.py
@@ -1,6 +1,6 @@
import copy
import os
-import torch
+import paddle
from typing import Any, Dict
from ..jit import build, cpp_format, generate, Runtime
@@ -51,10 +51,10 @@ class JITTuner:
continue
# Measure performance with L2 flush and a large GEMM kernel before to reduce overhead between kernels
- start_event = torch.cuda.Event(enable_timing=True)
- end_event = torch.cuda.Event(enable_timing=True)
- torch.empty(int(256e6 // 4), dtype=torch.int, device='cuda').zero_()
- torch.randn((8192, 8192), dtype=torch.float, device='cuda') @ torch.randn((8192, 8192), dtype=torch.float, device='cuda')
+ start_event = paddle.device.cuda.Event(enable_timing=True)
+ end_event = paddle.device.cuda.Event(enable_timing=True)
+ paddle.empty((int(256e6 // 4)), dtype=paddle.int32).zero_()
+ paddle.randn((8192, 8192), dtype=paddle.float32) @ paddle.randn((8192, 8192), dtype=paddle.float32)
start_event.record()
for i in range(20):
assert runtime(*args) == 0
diff --git a/deep_gemm/jit_kernels/utils.py b/deep_gemm/jit_kernels/utils.py
index c6da56b..a17b1b1 100644
--- a/deep_gemm/jit_kernels/utils.py
+++ b/deep_gemm/jit_kernels/utils.py
@@ -1,4 +1,4 @@
-import torch
+import paddle
_num_sms = None
@@ -11,7 +11,7 @@ def set_num_sms(num_sms: int) -> None:
num_sms: the desired maximum SM count for all GEMM kernels to use.
"""
global _num_sms
- assert 0 < num_sms <= torch.cuda.get_device_properties(device='cuda').multi_processor_count
+ assert 0 < num_sms <= paddle.device.cuda.get_device_properties().multi_processor_count
_num_sms = num_sms
@@ -25,7 +25,7 @@ def get_num_sms() -> int:
"""
global _num_sms
if _num_sms is None:
- _num_sms = torch.cuda.get_device_properties(device='cuda').multi_processor_count
+ _num_sms = paddle.device.cuda.get_device_properties().multi_processor_count
return _num_sms
@@ -74,9 +74,9 @@ def get_tma_aligned_size(x: int, element_size: int) -> int:
return ceil_div(x, alignment) * alignment
-def get_col_major_tma_aligned_tensor(x: torch.Tensor) -> torch.Tensor:
+def get_col_major_tma_aligned_tensor(x: paddle.Tensor) -> paddle.Tensor:
"""
- Returns TMA-aligned transposed format of the input tensor. `torch.transpose` will be called if necessary.
+ Returns TMA-aligned transposed format of the input tensor. `paddle.transpose` will be called if necessary.
If the input tensor is already column-major layout and 16-byte aligned along the M axis
(thus meets the requirement of LHS scaling tensor in DeepGEMM), this function will do nothing.
@@ -92,18 +92,20 @@ def get_col_major_tma_aligned_tensor(x: torch.Tensor) -> torch.Tensor:
m, n = x.shape[-2], x.shape[-1]
aligned_m = get_tma_aligned_size(m, x.element_size())
if x.dim() == 2:
- if x.stride(0) == 1 and x.stride(1) == aligned_m:
+ if x.strides[0] == 1 and x.strides[1] == aligned_m:
return x
x, remove_dim = x.unsqueeze(0), True
b = x.shape[0]
# The last kernel gives a column-major TMA aligned layout
- if x.stride(0) == aligned_m * n and x.stride(1) == 1 and x.stride(2) == aligned_m:
+ if x.strides[0] == aligned_m * n and x.strides[1] == 1 and x.strides[2] == aligned_m:
return x.squeeze(0) if remove_dim else x
# Normal layout requires transposing
- aligned_x = torch.transpose(torch.empty((b, n, aligned_m), device=x.device, dtype=x.dtype), 1, 2)
+ aligned_x = paddle.transpose(
+ paddle.empty((b, n, aligned_m), dtype=x.dtype), perm=[0, 2, 1]
+ )
aligned_x[:, :m, :] = x
aligned_x = aligned_x[:, :m, :]
return aligned_x.squeeze(0) if remove_dim else aligned_x
diff --git a/deep_gemm/paddle_utils.py b/deep_gemm/paddle_utils.py
new file mode 100644
index 0000000..2326807
--- /dev/null
+++ b/deep_gemm/paddle_utils.py
@@ -0,0 +1,20 @@
+import os
+
+def get_cuda_home():
+ """Get Cuda home directory"""
+ cuda_home = os.environ.get("CUDA_HOME") or os.environ.get("CUDA_PATH")
+ if cuda_home:
+ return cuda_home
+
+ try:
+ which_cmd = "which nvcc"
+
+ nvcc_path = os.popen(which_cmd).read().strip()
+ if nvcc_path:
+ return os.path.dirname(os.path.dirname(nvcc_path))
+ except Exception:
+ pass
+
+ return None
+
+CUDA_HOME = get_cuda_home()
\ No newline at end of file
diff --git a/deep_gemm/utils.py b/deep_gemm/utils.py
index d5cdd01..5237f09 100644
--- a/deep_gemm/utils.py
+++ b/deep_gemm/utils.py
@@ -1,15 +1,15 @@
import os
import sys
import time
-import torch
-import torch.distributed as dist
+import paddle
+import paddle.distributed as dist
def bench(fn, num_warmups: int = 5, num_tests: int = 10,
high_precision: bool = False):
# Flush L2 cache with 256 MB data
- torch.cuda.synchronize()
- cache = torch.empty(int(256e6 // 4), dtype=torch.int, device='cuda')
+ paddle.device.cuda.synchronize()
+ cache = paddle.empty((int(256e6 // 4)), dtype=paddle.int32)
cache.zero_()
# Warmup
@@ -18,18 +18,18 @@ def bench(fn, num_warmups: int = 5, num_tests: int = 10,
# Add a large kernel to eliminate the CPU launch overhead
if high_precision:
- x = torch.randn((8192, 8192), dtype=torch.float, device='cuda')
- y = torch.randn((8192, 8192), dtype=torch.float, device='cuda')
+ x = paddle.randn((8192, 8192), dtype=paddle.float32)
+ y = paddle.randn((8192, 8192), dtype=paddle.float32)
x @ y
# Testing
- start_event = torch.cuda.Event(enable_timing=True)
- end_event = torch.cuda.Event(enable_timing=True)
+ start_event = paddle.device.cuda.Event(enable_timing=True)
+ end_event = paddle.device.cuda.Event(enable_timing=True)
start_event.record()
for i in range(num_tests):
fn()
end_event.record()
- torch.cuda.synchronize()
+ paddle.device.synchronize()
return start_event.elapsed_time(end_event) / num_tests
@@ -106,21 +106,21 @@ def bench_kineto(fn, kernel_names, num_tests: int = 30, suppress_kineto_output:
# Profile
suppress = suppress_stdout_stderr if suppress_kineto_output and not using_nsys else empty_suppress
with suppress():
- schedule = torch.profiler.schedule(wait=0, warmup=1, active=1, repeat=1) if not using_nsys else None
- profiler = torch.profiler.profile(activities=[torch.profiler.ProfilerActivity.CUDA], schedule=schedule) if not using_nsys else empty_suppress()
+ scheduler = paddle.profiler.make_scheduler(closed=0, ready=1, record=1, repeat=1) if not using_nsys else None
+ profiler = paddle.profiler.Profiler(targets=[paddle.profiler.ProfilerTarget.CPU, paddle.profiler.ProfilerTarget.GPU], scheduler=scheduler) if not using_nsys else empty_suppress()
with profiler:
for i in range(2):
# NOTES: use a large kernel and a barrier to eliminate the unbalanced CPU launch overhead
if barrier_comm_profiling:
- lhs = torch.randn((8192, 8192), dtype=torch.float, device='cuda')
- rhs = torch.randn((8192, 8192), dtype=torch.float, device='cuda')
+ lhs = paddle.randn((8192, 8192), dtype=paddle.float32)
+ rhs = paddle.randn((8192, 8192), dtype=paddle.float32)
lhs @ rhs
- dist.all_reduce(torch.ones(1, dtype=torch.float, device='cuda'))
+ dist.all_reduce(paddle.ones(1, dtype=paddle.float32))
for _ in range(num_tests):
if sleep_between_tests > 0.0:
time.sleep(sleep_between_tests)
if flush_l2:
- torch.empty(flush_l2_size, dtype=torch.int, device='cuda').zero_()
+ paddle.empty(flush_l2_size, dtype=paddle.int32).zero_()
fn()
if not using_nsys:
--
2.43.0
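
The patch above is essentially a mechanical torch→paddle port of DeepGEMM's JIT layer: dtypes (`torch.float` → `paddle.float32`), streams and events (`torch.cuda.*` → `paddle.device.cuda.*`), strides (`x.stride(i)` → `x.strides[i]`), and a small `paddle_utils.CUDA_HOME` shim replacing `torch.utils.cpp_extension.CUDA_HOME`. A minimal sketch exercising only the paddle calls that appear in the hunks (assumes a CUDA build of paddle):

```python
import paddle

x = paddle.randn((128, 96), dtype=paddle.float32)
y = paddle.randn((96, 128), dtype=paddle.float32)
print(x.strides)                                      # list of strides, replaces x.stride(i)

start = paddle.device.cuda.Event(enable_timing=True)  # replaces torch.cuda.Event
end = paddle.device.cuda.Event(enable_timing=True)
start.record()
_ = x @ y
end.record()
paddle.device.synchronize()                           # replaces torch.cuda.synchronize()
print(start.elapsed_time(end), "ms")
```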


@@ -1,188 +0,0 @@
// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "dtype.h"
#include "matmul_helper.h"
#include "my_types.h"
#include "paddle/extension.h"
#include "paddle/phi/core/kernel_registry.h"
template <typename T>
void AvxCompute(const paddle::Tensor &x,
const paddle::Tensor &weight,
const paddle::Tensor &w_bias,
bool trans,
const std::string alog,
paddle::Tensor &out,
xft::Matrix<T> &quantizedWeight,
xft::Vector<float> &WeightScale,
xft::Vector<float> &WeightZero,
xft::Vector<float> &WeightSum,
MMHelper *mmHelper) {
auto out_data = out.data<float>();
const float *x_data = reinterpret_cast<const float *>(x.data<float>());
const float *bias_data = nullptr;
if (w_bias.initialized()) {
bias_data = reinterpret_cast<const float *>(w_bias.data<float>());
}
int m = 1;
for (int i = 0; i < x.shape().size() - 1; i++) {
m = m * x.shape()[i];
}
int k = x.shape()[x.shape().size() - 1];
int l = weight.shape()[1];
int n = weight.shape()[1];
if (w_bias.initialized()) {
mmHelper->compute_bias(false,
m,
n,
k,
1.0f,
x_data,
k,
quantizedWeight.Data(),
WeightScale.Data(),
WeightZero.Data(),
WeightSum.Data(),
0.0f,
out_data,
l,
bias_data);
} else {
mmHelper->compute(false,
m,
n,
k,
1.0f,
x_data,
k,
quantizedWeight.Data(),
WeightScale.Data(),
WeightZero.Data(),
WeightSum.Data(),
0.0,
out_data,
l);
}
};
template <typename T>
void AvxWeightOnly(const paddle::Tensor &x,
const paddle::Tensor &weight,
const paddle::Tensor &w_bias,
bool trans,
const std::string alog,
paddle::Tensor &out) {
static std::unordered_map<std::string,
std::tuple<xft::Matrix<T> *,
xft::Vector<float> *,
xft::Vector<float> *,
xft::Vector<float> *>>
weight_only_hub;
std::stringstream weights_addr;
weights_addr << weight.data<float>() << alog;
std::string weight_only_key = weights_addr.str();
auto it_created = weight_only_hub.find(weight_only_key);
static MMHelper *mmHelper;
int rows = weight.shape()[0], cols = weight.shape()[1];
xft::Vector<float> *WeightScale =
new xft::Vector<float>(); // if weight is int8
xft::Vector<float> *WeightZero =
new xft::Vector<float>(); // if weight is int8
xft::Vector<float> *WeightSum =
new xft::Vector<float>(); // if weight is int8
xft::Matrix<T> *quantizedWeight = new xft::Matrix<T>();
if (it_created == weight_only_hub.end()) {
auto weight_ptr = reinterpret_cast<const float *>(weight.data<float>());
xft::Matrix<T> convertedWeight;
mmHelper = new MMHelper(xft::DeviceKind::iCPU, 0);
mmHelper->convertWeight(trans,
rows,
cols,
weight_ptr,
nullptr,
nullptr,
convertedWeight,
*WeightScale,
*WeightZero,
*WeightSum);
quantizedWeight->Resize(rows, cols);
mmHelper->packWeight(trans, convertedWeight, *quantizedWeight);
weight_only_hub[weight_only_key] = std::make_tuple(
quantizedWeight, WeightScale, WeightZero, WeightSum);
AvxCompute<T>(x,
weight,
w_bias,
trans,
alog,
out,
*quantizedWeight,
*WeightScale,
*WeightZero,
*WeightSum,
mmHelper);
} else {
AvxCompute<T>(x,
weight,
w_bias,
trans,
alog,
out,
*(std::get<0>(it_created->second)),
*(std::get<1>(it_created->second)),
*(std::get<2>(it_created->second)),
*(std::get<3>(it_created->second)),
mmHelper);
}
}
std::vector<paddle::Tensor> InvokeAvxWeightOnly(const paddle::Tensor &x,
const paddle::Tensor &weight,
const paddle::Tensor &w_bias,
const std::string &alog,
bool trans) {
auto out_shape = x.shape();
out_shape[out_shape.size() - 1] = weight.shape()[1];
auto out = paddle::empty(out_shape, x.dtype(), paddle::CPUPlace());
if (alog == "int8") {
AvxWeightOnly<int8_t>(x, weight, w_bias, trans, alog, out);
} else if (alog == "fp16") {
AvxWeightOnly<float16_t>(x, weight, w_bias, trans, alog, out);
} else {
AvxWeightOnly<float16_t>(x, weight, w_bias, trans, alog, out);
}
return {out};
}
std::vector<std::vector<int64_t>> AvxWeightOnlyInferShape(
std::vector<int64_t> x_shape,
std::vector<int64_t> weigh_shape,
std::vector<int64_t> weigh_bias_shape) {
int m = 1;
for (int i = 0; i < x_shape.size() - 1; i++) {
m = m * x_shape[i];
}
return {std::vector<int64_t>{m, weigh_shape[1]}};
}
std::vector<paddle::DataType> AvxWeightOnlyInferDtype(
paddle::DataType x_dtype,
paddle::DataType weight_dtype,
paddle::DataType weight_bias_dtype) {
return {x_dtype};
}
PD_BUILD_STATIC_OP(avx_weight_only)
.Inputs({"x", "weight", "w_bias"})
.Outputs({"out"})
.Attrs({"alog: std::string", "trans:bool"})
.SetKernelFn(PD_KERNEL(InvokeAvxWeightOnly))
.SetInferShapeFn(PD_INFER_SHAPE(AvxWeightOnlyInferShape))
.SetInferDtypeFn(PD_INFER_DTYPE(AvxWeightOnlyInferDtype));


@@ -0,0 +1,268 @@
// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <vector>
#include "paddle/extension.h"
#ifndef PD_BUILD_STATIC_OP
#define PD_BUILD_STATIC_OP(name) PD_BUILD_OP(static_op_##name)
#endif
template <typename T>
void RebuildPaddingCPUImpl(T *output_data,
const T *input_data,
const int *cum_offsets_data,
const int *seq_len_this_time_data,
const int *seq_lens_decoder_data,
const int *seq_lens_encoder_data,
int max_input_length,
int dim_embed,
const int elem_nums) {
for (int i = 0; i < elem_nums; ++i) {
const int bi = i / dim_embed;
const int bias_idx = i % dim_embed;
int seq_id = 0;
if (seq_len_this_time_data[bi] == 0) {
continue;
}
if (seq_lens_decoder_data[bi] == 0 && seq_lens_encoder_data[bi] == 0) {
continue;
}
if (seq_lens_encoder_data[bi] > 0) {
seq_id = seq_lens_encoder_data[bi] - 1;
}
const int ori_token_idx =
bi * max_input_length - cum_offsets_data[bi] + seq_id;
const int src_offset = ori_token_idx * dim_embed + bias_idx;
output_data[i] = input_data[src_offset];
}
}
template <typename T>
void RebuildAppendPaddingCPUImpl(T *output_data,
const T *input_data,
const int *cum_offsets_data,
const int *seq_len_this_time_data,
const int *seq_lens_decoder_data,
const int *seq_lens_encoder_data,
const int *output_padding_offset_data,
const int max_input_length,
const int dim_embed,
const int64_t output_elem_nums) {
for (int i = 0; i < output_elem_nums; ++i) {
int out_token_id = i / dim_embed;
int ori_token_id =
out_token_id + output_padding_offset_data[out_token_id];
int bi = ori_token_id / max_input_length;
if (seq_len_this_time_data[bi] == 0 ||
(seq_lens_decoder_data[bi] == 0 &&
seq_lens_encoder_data[bi] == 0)) {
continue;
}
int seq_id = 0;
if (seq_lens_encoder_data[bi] > 0) {
seq_id = seq_lens_encoder_data[bi] - 1;
}
int input_token_id = ori_token_id - cum_offsets_data[bi] + seq_id;
int bias_idx = i % dim_embed;
int src_offset = input_token_id * dim_embed + bias_idx;
output_data[i] = input_data[src_offset];
}
}
std::vector<paddle::Tensor> RebuildPaddingCPU(
const paddle::Tensor &tmp_out,
const paddle::Tensor &cum_offsets,
const paddle::Tensor &seq_len_this_time,
const paddle::Tensor &seq_lens_decoder,
const paddle::Tensor &seq_lens_encoder,
const paddle::optional<paddle::Tensor> &output_padding_offset,
int max_input_length) {
auto tmp_out_cpu = tmp_out.copy_to(paddle::CPUPlace(), true);
auto cum_offsets_cpu = cum_offsets.copy_to(paddle::CPUPlace(), true);
auto seq_len_this_time_cpu =
seq_len_this_time.copy_to(paddle::CPUPlace(), true);
auto seq_lens_decoder_cpu =
seq_lens_decoder.copy_to(paddle::CPUPlace(), true);
auto seq_lens_encoder_cpu =
seq_lens_encoder.copy_to(paddle::CPUPlace(), true);
paddle::optional<paddle::Tensor> output_padding_offset_cpu;
if (output_padding_offset) {
output_padding_offset_cpu =
output_padding_offset->copy_to(paddle::CPUPlace(), true);
}
int token_num = tmp_out_cpu.shape()[0];
int dim_embed = tmp_out_cpu.shape()[1];
int bsz = cum_offsets_cpu.shape()[0];
paddle::Tensor out;
if (output_padding_offset_cpu) {
int need_delete_token_num = 0;
for (int i = 0; i < bsz; ++i) {
if (seq_lens_encoder_cpu.data<int>()[i] > 0) {
need_delete_token_num +=
seq_lens_encoder_cpu.data<int>()[i] - 1;
}
}
int output_token_num = token_num - need_delete_token_num;
out = paddle::full({output_token_num, dim_embed},
0,
tmp_out_cpu.dtype(),
paddle::CPUPlace());
} else {
out = paddle::full(
{bsz, dim_embed}, 0, tmp_out_cpu.dtype(), paddle::CPUPlace());
}
const int *cum_offsets_data = cum_offsets_cpu.data<int>();
const int *seq_len_this_time_data = seq_len_this_time_cpu.data<int>();
const int *seq_lens_decoder_data = seq_lens_decoder_cpu.data<int>();
const int *seq_lens_encoder_data = seq_lens_encoder_cpu.data<int>();
int elem_nums = out.numel();
if (output_padding_offset_cpu) {
const int *output_padding_offset_data =
output_padding_offset_cpu->data<int>();
switch (tmp_out_cpu.dtype()) {
case paddle::DataType::FLOAT32:
RebuildAppendPaddingCPUImpl<float>(out.data<float>(),
tmp_out_cpu.data<float>(),
cum_offsets_data,
seq_len_this_time_data,
seq_lens_decoder_data,
seq_lens_encoder_data,
output_padding_offset_data,
max_input_length,
dim_embed,
elem_nums);
break;
case paddle::DataType::FLOAT16:
RebuildAppendPaddingCPUImpl<paddle::float16>(
out.data<paddle::float16>(),
tmp_out_cpu.data<paddle::float16>(),
cum_offsets_data,
seq_len_this_time_data,
seq_lens_decoder_data,
seq_lens_encoder_data,
output_padding_offset_data,
max_input_length,
dim_embed,
elem_nums);
break;
case paddle::DataType::BFLOAT16:
RebuildAppendPaddingCPUImpl<paddle::bfloat16>(
out.data<paddle::bfloat16>(),
tmp_out_cpu.data<paddle::bfloat16>(),
cum_offsets_data,
seq_len_this_time_data,
seq_lens_decoder_data,
seq_lens_encoder_data,
output_padding_offset_data,
max_input_length,
dim_embed,
elem_nums);
break;
default:
PD_THROW(
"Unsupported data type for rebuild_padding_cpu. "
"Only float32, float16, and bfloat16 are supported.");
}
} else {
switch (tmp_out_cpu.dtype()) {
case paddle::DataType::FLOAT32:
RebuildPaddingCPUImpl<float>(out.data<float>(),
tmp_out_cpu.data<float>(),
cum_offsets_data,
seq_len_this_time_data,
seq_lens_decoder_data,
seq_lens_encoder_data,
max_input_length,
dim_embed,
elem_nums);
break;
case paddle::DataType::FLOAT16:
RebuildPaddingCPUImpl<paddle::float16>(
out.data<paddle::float16>(),
tmp_out_cpu.data<paddle::float16>(),
cum_offsets_data,
seq_len_this_time_data,
seq_lens_decoder_data,
seq_lens_encoder_data,
max_input_length,
dim_embed,
elem_nums);
break;
case paddle::DataType::BFLOAT16:
RebuildPaddingCPUImpl<paddle::bfloat16>(
out.data<paddle::bfloat16>(),
tmp_out_cpu.data<paddle::bfloat16>(),
cum_offsets_data,
seq_len_this_time_data,
seq_lens_decoder_data,
seq_lens_encoder_data,
max_input_length,
dim_embed,
elem_nums);
break;
default:
PD_THROW(
"Unsupported data type for rebuild_padding_cpu. "
"Only float32, float16, and bfloat16 are supported.");
}
}
return {out};
}
std::vector<std::vector<int64_t>> RebuildPaddingInferShape(
const std::vector<int64_t> &tmp_out_shape,
const std::vector<int64_t> &cum_offsets_shape,
const std::vector<int64_t> &seq_len_this_time_shape,
const std::vector<int64_t> &seq_lens_decoder_shape,
const std::vector<int64_t> &seq_lens_encoder_shape,
const paddle::optional<std::vector<int64_t>> &output_padding_offset_shape) {
int64_t dim_embed = tmp_out_shape[1];
if (output_padding_offset_shape) {
return {{-1, dim_embed}};
} else {
int64_t bsz = cum_offsets_shape[0];
return {{bsz, dim_embed}};
}
}
std::vector<paddle::DataType> RebuildPaddingInferDtype(
const paddle::DataType &tmp_out_dtype,
const paddle::DataType &cum_offsets_dtype,
const paddle::DataType &seq_len_this_time_dtype,
const paddle::DataType &seq_lens_decoder_dtype,
const paddle::DataType &seq_lens_encoder_dtype,
const paddle::optional<paddle::DataType> &output_padding_offset_dtype) {
return {tmp_out_dtype};
}
PD_BUILD_STATIC_OP(rebuild_padding_cpu)
.Inputs({"tmp_out",
"cum_offsets",
"seq_len_this_time",
"seq_lens_decoder",
"seq_lens_encoder",
paddle::Optional("output_padding_offset")})
.Outputs({"out"})
.Attrs({"max_input_length: int"})
.SetKernelFn(PD_KERNEL(RebuildPaddingCPU))
.SetInferShapeFn(PD_INFER_SHAPE(RebuildPaddingInferShape))
.SetInferDtypeFn(PD_INFER_DTYPE(RebuildPaddingInferDtype));


@@ -1,201 +0,0 @@
// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "layers_decoder.h"
#include "paddle/extension.h"
#include "paddle/phi/core/kernel_registry.h"
std::vector<paddle::Tensor> InvokeAllLLaMALayer(
const paddle::Tensor &input,
const std::vector<paddle::Tensor> &ln1Gamma,
const std::vector<paddle::Tensor> &ln1Beta,
const std::vector<paddle::Tensor> &qkvWeight,
const std::vector<paddle::Tensor> &qkvBiasWeight,
const std::vector<paddle::Tensor> &attnOutWeight,
const std::vector<paddle::Tensor> &attnOutBias,
const std::vector<paddle::Tensor> &ln2Gamma,
const std::vector<paddle::Tensor> &ln2Beta,
const std::vector<paddle::Tensor> &gateWeight,
const std::vector<paddle::Tensor> &gateBias,
const std::vector<paddle::Tensor> &upWeight,
const std::vector<paddle::Tensor> &upBias,
const std::vector<paddle::Tensor> &downWeight,
const std::vector<paddle::Tensor> &downBias,
const paddle::Tensor &pastSeqLen,
const paddle::Tensor &currentSeqLen,
const paddle::Tensor &step,
int hiddensize,
int totalLayer,
const std::string &computeType,
const std::string &activation,
const std::string &normType,
int attHeadDim,
int attHeadNum,
int kvHeadNum,
int maxPositions,
int maxPosEmbed,
int intermediateSize) {
auto out = paddle::empty_like(input);
auto batchSize = input.shape()[0];
auto inputSeqLen = input.shape()[1];
auto past_seq_len = pastSeqLen.data<int64_t>()[0];
auto cur_seq_len = static_cast<int64_t>(currentSeqLen.data<int32_t>()[0]);
auto step_id = step.data<int64_t>()[0];
auto output_ptr = reinterpret_cast<void *>(out.data<float>());
auto xft_data_type = xft::DataType::fp16;
if (computeType == "bf16") {
xft_data_type = xft::DataType::bf16;
} else if (computeType == "bf16_int8") {
xft_data_type = xft::DataType::bf16_int8;
}
auto xft_act_type = xft::ActivationType::SILU;
if (activation == "relu") {
xft_act_type = xft::ActivationType::RELU;
} else if (activation == "gelu") {
xft_act_type = xft::ActivationType::GELU;
} else if (activation == "swiglu") {
xft_act_type = xft::ActivationType::SWIGLU;
}
auto xft_norm_type = xft::NormType::RMS;
if (normType == "layernorm") {
xft_norm_type = xft::NormType::LN;
}
auto input_ptr = reinterpret_cast<const void *>(input.data<float>());
for (int i = 0; i < totalLayer; ++i) {
auto ln1Gamma_ptr =
reinterpret_cast<const float *>(ln1Gamma[i].data<float>());
auto ln1Beta_ptr =
reinterpret_cast<const float *>(ln1Beta[i].data<float>());
auto qkvWeight_ptr =
reinterpret_cast<const void *>(qkvWeight[i].data<float>());
auto qkvBiasWeight_ptr =
reinterpret_cast<const float *>(qkvBiasWeight[i].data<float>());
auto attnOutWeight_ptr =
reinterpret_cast<const void *>(attnOutWeight[i].data<float>());
auto ln2Gamma_ptr =
reinterpret_cast<const float *>(ln2Gamma[i].data<float>());
auto ln2Beta_ptr =
reinterpret_cast<const float *>(ln2Beta[i].data<float>());
auto gate_weight_ptr =
reinterpret_cast<const void *>(gateWeight[i].data<float>());
auto up_weight_ptr =
reinterpret_cast<const void *>(upWeight[i].data<float>());
auto down_weight_ptr =
reinterpret_cast<const void *>(downWeight[i].data<float>());
auto gate_bias_ptr =
reinterpret_cast<const float *>(gateBias[i].data<float>());
auto up_bias_ptr =
reinterpret_cast<const float *>(upBias[i].data<float>());
auto down_bias_ptr =
reinterpret_cast<const float *>(downBias[i].data<float>());
auto attnOutBias_ptr =
reinterpret_cast<const float *>(attnOutBias[i].data<float>());
invokeLayerLLaMA(
xft_data_type, // dt
xft_act_type, // at
xft_norm_type, // nt
i, // layerId
totalLayer, // totalLayers
batchSize, // batchSize
inputSeqLen, // inputSeqLen
attHeadDim, // attHeadDim
attHeadNum, // attHeadNum
kvHeadNum, // kvHeadNum
maxPositions, // maxPositions
maxPosEmbed, // maxPosEmbed
past_seq_len, // pastSeqLen
cur_seq_len, // currentSeqLen
step_id, // step
hiddensize, // hiddenSize
intermediateSize, // intermediateSize
reinterpret_cast<void *>(output_ptr), // output
hiddensize, // outputStride
input_ptr, // input
hiddensize, // inputStride
ln1Gamma_ptr, // ln1Gamma
ln1Beta_ptr, // ln1Beta
qkvWeight_ptr, // queryWeight
qkvWeight_ptr + hiddensize, // keyWeight
qkvWeight_ptr + hiddensize + kvHeadNum * attHeadDim, // valueWeight
attnOutWeight_ptr, // attnOutWeight
ln2Gamma_ptr, // ln2Gamma
ln2Beta_ptr, // ln2Beta
gate_weight_ptr,
up_weight_ptr,
down_weight_ptr,
qkvBiasWeight_ptr, // queryBias
qkvBiasWeight_ptr + hiddensize, // keyBias
qkvBiasWeight_ptr + hiddensize +
kvHeadNum * attHeadDim, // valueBias
attnOutBias_ptr, // attnOutBias
qkvWeight_ptr, // myqkvWeight
gate_bias_ptr,
up_bias_ptr,
down_bias_ptr,
qkvBiasWeight_ptr);
if (i < totalLayer - 1) {
memcpy(const_cast<void *>(input_ptr),
output_ptr,
batchSize * inputSeqLen * hiddensize * sizeof(float));
}
}
return {out};
}
std::vector<std::vector<int64_t>> AllLLaMALayerInferShape(
std::vector<int64_t> x_shape) {
return {x_shape};
}
std::vector<paddle::DataType> AllLLaMALayerInferDtype(
paddle::DataType x_dtype) {
return {x_dtype};
}
PD_BUILD_STATIC_OP(xft_llama_all_layer)
.Inputs({
"x",
paddle::Vec("ln1Gamma"),
paddle::Vec("ln1Beta"),
paddle::Vec("qkvWeight"),
paddle::Vec("qkvBiasWeight"),
paddle::Vec("attnOutWeight"),
paddle::Vec("attnOutBias"),
paddle::Vec("ln2Gamma"),
paddle::Vec("ln2Beta"),
paddle::Vec("gateWeight"),
paddle::Vec("gateBias"),
paddle::Vec("upWeight"),
paddle::Vec("upBias"),
paddle::Vec("downWeight"),
paddle::Vec("downBias"),
"pastSeqLen",
"currentSeqLen",
"step",
})
.Outputs({"out"})
.Attrs({"hiddensize :int",
"totalLayer :int",
"computeType : std::string",
"activation :std::string",
"normType :std::string",
"attHeadDim: int",
"attHeadNum: int",
"kvHeadNum: int",
"maxPositions: int",
"maxPosEmbed: int",
"intermediateSize: int"})
.SetKernelFn(PD_KERNEL(InvokeAllLLaMALayer))
.SetInferShapeFn(PD_INFER_SHAPE(AllLLaMALayerInferShape))
.SetInferDtypeFn(PD_INFER_DTYPE(AllLLaMALayerInferDtype));


@@ -1,126 +0,0 @@
// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <omp.h>
#include <cstdio>
#include <iostream>
#include "paddle/extension.h"
void greedy_search(const float *probs,
int64_t *next_token_ids,
int bsz,
int vocab_size) {
int numThreads = 0;
#pragma omp parallel
{
int tid = omp_get_thread_num();
if (tid == 0) {
numThreads = omp_get_num_threads();
}
}
float maxVals[bsz];
// Small batch size (each sample can have at least 2 threads)
if (numThreads / bsz >= 2) {
int thrPerSample = numThreads / bsz;
int sizePerThr = (vocab_size + thrPerSample - 1) / thrPerSample;
int maxIndices[bsz * thrPerSample];
float maxValues[bsz * thrPerSample];
// TODO: if size is small, possible to cause out of boundary
#pragma omp parallel for collapse(2)
for (int b = 0; b < bsz; ++b) {
for (int t = 0; t < thrPerSample; ++t) {
int start = t * sizePerThr;
int end = (start + sizePerThr) > vocab_size
? vocab_size
: (start + sizePerThr);
const float *p = probs + b * vocab_size;
int maxIdx = start;
float maxVal = p[start];
for (int off = start + 1; off < end; ++off) {
if (p[off] > maxVal) {
maxVal = p[off];
maxIdx = off;
}
}
// False sharing happens, but since only one time, not avoided
maxIndices[b * thrPerSample + t] = maxIdx;
maxValues[b * thrPerSample + t] = maxVal;
}
}
// Local reduction
for (int i = 0; i < bsz; ++i) {
int *pIndices = maxIndices + i * thrPerSample;
float *pValues = maxValues + i * thrPerSample;
int maxIdx = pIndices[0];
float maxVal = pValues[0];
for (int j = 1; j < thrPerSample; ++j) {
if (pValues[j] > maxVal) {
maxVal = pValues[j];
maxIdx = pIndices[j];
}
}
next_token_ids[i] = maxIdx;
maxVals[i] = maxVal;
}
}
// Each thread handle one sample (one row)
else {
#pragma omp parallel for
for (int i = 0; i < bsz; ++i) {
int maxId = 0;
const float *p = probs + i * vocab_size;
float maxVal = p[0];
for (int j = 1; j < vocab_size; ++j) {
if (p[j] > maxVal) {
maxVal = p[j];
maxId = j;
}
}
next_token_ids[i] = maxId;
maxVals[i] = maxVal;
}
}
return;
}
std::vector<paddle::Tensor> XftGreedySearch(const paddle::Tensor &probs) {
const int bsz = probs.shape()[0];
const int vocab_size = probs.shape()[1];
auto next_tokens =
paddle::empty({bsz, 1}, paddle::DataType::INT64, probs.place());
greedy_search(probs.data<float>(),
const_cast<int64_t *>(next_tokens.data<int64_t>()),
bsz,
vocab_size);
return {next_tokens};
}
std::vector<std::vector<int64_t>> XftGreedySearchInferShape(
const std::vector<int64_t> &probs_shape) {
int64_t bsz = probs_shape[0];
return {{bsz, 1}};
}
std::vector<paddle::DataType> XftGreedySearchInferDtype(
const paddle::DataType &probs_dtype) {
return {paddle::DataType::INT64};
}
PD_BUILD_STATIC_OP(xft_greedy_search)
.Inputs({"probs"})
.Outputs({"next_tokens_ids"})
.SetInferShapeFn(PD_INFER_SHAPE(XftGreedySearchInferShape))
.SetInferDtypeFn(PD_INFER_DTYPE(XftGreedySearchInferDtype))
.SetKernelFn(PD_KERNEL(XftGreedySearch));
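The OpenMP kernel above is, at heart, a per-row argmax over the vocabulary: for each sample in the batch it picks the token id with the highest probability. A minimal single-threaded C++ reference of the same computation is sketched below (the helper name greedy_search_reference is illustrative); it is useful as a correctness check for the parallel version.
#include <cstdint>
// For each row of a [bsz, vocab_size] probability matrix, write the index of
// the maximum element into next_token_ids.
void greedy_search_reference(const float *probs, int64_t *next_token_ids,
                             int bsz, int vocab_size) {
    for (int b = 0; b < bsz; ++b) {
        const float *row = probs + static_cast<int64_t>(b) * vocab_size;
        int max_id = 0;
        for (int j = 1; j < vocab_size; ++j) {
            if (row[j] > row[max_id]) max_id = j;
        }
        next_token_ids[b] = max_id;
    }
}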

File diff suppressed because it is too large.


@@ -17,15 +17,12 @@
#include "paddle/phi/core/memory/memcpy.h"
template <int THREADBLOCK_SIZE>
__global__ void GetMaxLenKernel(const int *seq_lens,
const int *seq_lens_this_time,
const int *seq_lens_encoder,
const int *seq_lens_this_time_merged,
const int *seq_lens_encoder_merged,
const int *seq_mapping,
const int *system_lens,
int *max_lens,
const int batch_size) {
__global__ void
GetMaxLenKernel(const int *seq_lens, const int *seq_lens_this_time,
const int *seq_lens_encoder,
const int *seq_lens_this_time_merged,
const int *seq_lens_encoder_merged, const int *seq_mapping,
const int *system_lens, int *max_lens, const int batch_size) {
const int tid = threadIdx.x;
typedef cub::BlockReduce<int, THREADBLOCK_SIZE> BlockReduce;
@@ -41,43 +38,61 @@ __global__ void GetMaxLenKernel(const int *seq_lens,
int max_dec_len_without_system_this_thread = 0;
for (int i = tid; i < batch_size; i += blockDim.x) {
const int seq_len_this_time = seq_lens_this_time[i];
max_len_this_time_this_thread = max(seq_len_this_time,
max_len_this_time_this_thread);
max_len_encoder_this_thread = max(seq_lens_encoder[i],
max_len_encoder_this_thread);
max_len_this_time_this_thread =
max(seq_len_this_time, max_len_this_time_this_thread);
max_len_encoder_this_thread =
max(seq_lens_encoder[i], max_len_encoder_this_thread);
max_len_decoder_this_thread = max(seq_lens[i], max_len_decoder_this_thread);
if (seq_len_this_time <= 0) continue;
if (seq_len_this_time <= 0)
continue;
const int max_just_dec_len_now = seq_lens_encoder[i] > 0 ? 0 : seq_lens[i];
max_len_this_thread = max(seq_lens[i] + seq_len_this_time,
max_len_this_thread);
max_just_dec_len_this_thread = max(max_just_dec_len_this_thread,
max_just_dec_len_now);
max_len_this_thread =
max(seq_lens[i] + seq_len_this_time, max_len_this_thread);
max_just_dec_len_this_thread =
max(max_just_dec_len_this_thread, max_just_dec_len_now);
if (system_lens) {
const int real_bid = seq_mapping[i];
const int system_len_now = system_lens[real_bid];
max_system_len_this_thread = max(max_system_len_this_thread, system_len_now);
max_dec_len_without_system_this_thread = max(max_dec_len_without_system_this_thread,
max_just_dec_len_now - system_len_now);
max_system_len_this_thread =
max(max_system_len_this_thread, system_len_now);
max_dec_len_without_system_this_thread =
max(max_dec_len_without_system_this_thread,
max_just_dec_len_now - system_len_now);
}
}
if (system_lens) {
for (int i = tid; i < batch_size; i += blockDim.x) {
const int ori_seq_len_this_time = seq_lens_this_time_merged[i];
if (ori_seq_len_this_time <= 0) continue;
const int max_just_dec_merged_len_this_time_now = seq_lens_encoder_merged[i] > 0 ?
0 : ori_seq_len_this_time;
max_just_dec_merged_len_this_time_this_thread = max(max_just_dec_merged_len_this_time_this_thread,
max_just_dec_merged_len_this_time_now);
if (ori_seq_len_this_time <= 0)
continue;
const int max_just_dec_merged_len_this_time_now =
seq_lens_encoder_merged[i] > 0 ? 0 : ori_seq_len_this_time;
max_just_dec_merged_len_this_time_this_thread =
max(max_just_dec_merged_len_this_time_this_thread,
max_just_dec_merged_len_this_time_now);
}
}
int total_max_len_this_time = BlockReduce(temp_storage).Reduce(max_len_this_time_this_thread, MaxOp<int>());
int total_max_len_encoder = BlockReduce(temp_storage).Reduce(max_len_encoder_this_thread, MaxOp<int>());
int total_max_len_decoder = BlockReduce(temp_storage).Reduce(max_len_decoder_this_thread, MaxOp<int>());
int total = BlockReduce(temp_storage).Reduce(max_len_this_thread, MaxOp<int>());
int total_just_dec = BlockReduce(temp_storage).Reduce(max_just_dec_len_this_thread, MaxOp<int>());
int total_just_dec_merged = BlockReduce(temp_storage).Reduce(max_just_dec_merged_len_this_time_this_thread, MaxOp<int>());
int total_system_len = BlockReduce(temp_storage).Reduce(max_system_len_this_thread, MaxOp<int>());
int total_dec_len_without_system = BlockReduce(temp_storage).Reduce(max_dec_len_without_system_this_thread, MaxOp<int>());
int total_max_len_this_time =
BlockReduce(temp_storage)
.Reduce(max_len_this_time_this_thread, MaxOp<int>());
int total_max_len_encoder =
BlockReduce(temp_storage)
.Reduce(max_len_encoder_this_thread, MaxOp<int>());
int total_max_len_decoder =
BlockReduce(temp_storage)
.Reduce(max_len_decoder_this_thread, MaxOp<int>());
int total =
BlockReduce(temp_storage).Reduce(max_len_this_thread, MaxOp<int>());
int total_just_dec = BlockReduce(temp_storage)
.Reduce(max_just_dec_len_this_thread, MaxOp<int>());
int total_just_dec_merged =
BlockReduce(temp_storage)
.Reduce(max_just_dec_merged_len_this_time_this_thread, MaxOp<int>());
int total_system_len = BlockReduce(temp_storage)
.Reduce(max_system_len_this_thread, MaxOp<int>());
int total_dec_len_without_system =
BlockReduce(temp_storage)
.Reduce(max_dec_len_without_system_this_thread, MaxOp<int>());
if (tid == 0) {
max_lens[0] = total_max_len_this_time;
max_lens[1] = total_max_len_encoder;
@@ -90,30 +105,22 @@ __global__ void GetMaxLenKernel(const int *seq_lens,
}
}
void GetMaxLen(const paddle::Tensor& seq_lens_tensor,
const paddle::Tensor& seq_lens_this_time,
const paddle::Tensor& seq_lens_encoder,
paddle::Tensor &max_len_tensor,
const int batch_size) {
void GetMaxLen(const paddle::Tensor &seq_lens_tensor,
const paddle::Tensor &seq_lens_this_time,
const paddle::Tensor &seq_lens_encoder,
paddle::Tensor &max_len_tensor, const int batch_size) {
constexpr int blockSize = 1024;
GetMaxLenKernel<blockSize><<<1, blockSize, 0, seq_lens_encoder.stream()>>>(
seq_lens_tensor.data<int>(),
seq_lens_this_time.data<int>(),
seq_lens_encoder.data<int>(),
nullptr,
nullptr,
nullptr,
nullptr,
max_len_tensor.data<int>(),
batch_size);
seq_lens_tensor.data<int>(), seq_lens_this_time.data<int>(),
seq_lens_encoder.data<int>(), nullptr, nullptr, nullptr, nullptr,
max_len_tensor.data<int>(), batch_size);
}
__global__ void split_q_block(const int* __restrict__ seq_lens_q,
const int* __restrict__ seq_lens_encoder,
int* __restrict__ batch_ids,
int* __restrict__ tile_ids_per_batch,
int* __restrict__ num_blocks_x,
const int bsz,
__global__ void split_q_block(const int *__restrict__ seq_lens_q,
const int *__restrict__ seq_lens_encoder,
int *__restrict__ batch_ids,
int *__restrict__ tile_ids_per_batch,
int *__restrict__ num_blocks_x, const int bsz,
const int num_rows_per_block,
const int group_size) {
if (threadIdx.x == 0) {
@@ -124,8 +131,7 @@ __global__ void split_q_block(const int* __restrict__ seq_lens_q,
if (seq_lens_encoder && seq_lens_encoder[bid] > 0) {
seq_len = 0;
}
const int loop_times =
div_up(seq_len * group_size, num_rows_per_block);
const int loop_times = div_up(seq_len * group_size, num_rows_per_block);
for (uint32_t tile_id = 0; tile_id < loop_times; tile_id++) {
batch_ids[index] = bid;
tile_ids_per_batch[index++] = tile_id;
@@ -136,14 +142,12 @@ __global__ void split_q_block(const int* __restrict__ seq_lens_q,
}
}
__global__ void split_kv_block(const int* __restrict__ seq_lens_decoder,
const int* __restrict__ seq_lens_encoder,
int* __restrict__ batch_ids,
int* __restrict__ tile_ids_per_batch,
int* __restrict__ num_blocks_x,
const int bsz,
const int pad_len,
const int num_row_per_block) {
__global__ void split_kv_block(const int *__restrict__ seq_lens_decoder,
const int *__restrict__ seq_lens_encoder,
int *__restrict__ batch_ids,
int *__restrict__ tile_ids_per_batch,
int *__restrict__ num_blocks_x, const int bsz,
const int pad_len, const int num_row_per_block) {
if (threadIdx.x == 0) {
int gridx = 0;
int index = 0;
@@ -165,50 +169,46 @@ __global__ void split_kv_block(const int* __restrict__ seq_lens_decoder,
}
template <int THREADBLOCK_SIZE>
__global__ void get_max_len_kv_ernel(int* max_seq_lens_out,
const int* seq_lens_this_time,
const int* seq_lens_decoder,
const int batch_size) {
__global__ void
get_max_len_kv_ernel(int *max_seq_lens_out, const int *seq_lens_this_time,
const int *seq_lens_decoder, const int batch_size) {
const int tid = threadIdx.x;
typedef cub::BlockReduce<int, THREADBLOCK_SIZE> BlockReduce;
__shared__ typename BlockReduce::TempStorage temp_storage;
int max_len_this_thread = 0;
for (int i = tid; i < batch_size; i += blockDim.x) {
if (seq_lens_decoder[i] == 0) continue;
max_len_this_thread = max(seq_lens_this_time[i] + seq_lens_decoder[i], max_len_this_thread);
if (seq_lens_decoder[i] == 0)
continue;
max_len_this_thread =
max(seq_lens_this_time[i] + seq_lens_decoder[i], max_len_this_thread);
}
int total = BlockReduce(temp_storage).Reduce(max_len_this_thread, MaxOp<int>());
int total =
BlockReduce(temp_storage).Reduce(max_len_this_thread, MaxOp<int>());
if (tid == 0) {
*max_seq_lens_out = total;
}
}
std::vector<paddle::Tensor> GetBlockShapeAndSplitKVBlock(
const paddle::Tensor& seq_lens_encoder,
const paddle::Tensor& seq_lens_decoder,
const paddle::Tensor& seq_lens_this_time,
const paddle::Tensor& cum_offsets,
const int encoder_block_shape_q,
const int decoder_block_shape_q,
const int group_size,
const int block_size,
const paddle::Tensor &seq_lens_encoder,
const paddle::Tensor &seq_lens_decoder,
const paddle::Tensor &seq_lens_this_time, const paddle::Tensor &cum_offsets,
const int encoder_block_shape_q, const int decoder_block_shape_q,
const int group_size, const int block_size,
const int decoder_step_token_num) {
auto stream = seq_lens_encoder.stream();
int bsz = cum_offsets.shape()[0];
auto max_len_tensor =
GetEmptyTensor({8}, paddle::DataType::INT32, seq_lens_encoder.place());
GetMaxLen(
seq_lens_decoder,
seq_lens_this_time,
seq_lens_encoder,
max_len_tensor,
bsz);
GetMaxLen(seq_lens_decoder, seq_lens_this_time, seq_lens_encoder,
max_len_tensor, bsz);
// max_len_this_time, max_enc_len_this_time, max_dec_len_this_time, max_enc_dec_len_this_time,
// max_just_dec_len_this_time, max_just_dec_merged_len_this_time, max_system_len, max_just_dec_len_without_system
// max_len_this_time, max_enc_len_this_time, max_dec_len_this_time,
// max_enc_dec_len_this_time, max_just_dec_len_this_time,
// max_just_dec_merged_len_this_time, max_system_len,
// max_just_dec_len_without_system
auto max_len_cpu = max_len_tensor.copy_to(paddle::CPUPlace(), false);
auto max_len_cpu_ptr = max_len_cpu.data<int>();
int max_len_this_time = max_len_cpu_ptr[0];
@@ -229,67 +229,67 @@ std::vector<paddle::Tensor> GetBlockShapeAndSplitKVBlock(
paddle::Tensor decoder_batch_ids;
paddle::Tensor decoder_tile_ids_per_batch;
paddle::Tensor decoder_num_blocks_x_cpu; /*cpu*/
paddle::Tensor max_len_kv_cpu; /*cpu*/
paddle::Tensor max_len_kv_cpu; /*cpu*/
auto max_len_kv =
GetEmptyTensor({1}, paddle::DataType::INT32, seq_lens_decoder.place());
get_max_len_kv_ernel<128><<<1, 128, 0, stream>>>(
max_len_kv.data<int>(),
seq_lens_this_time.data<int>(),
seq_lens_decoder.data<int>(),
bsz
);
max_len_kv.data<int>(), seq_lens_this_time.data<int>(),
seq_lens_decoder.data<int>(), bsz);
max_len_kv_cpu =
max_len_kv.copy_to(paddle::CPUPlace(), false);
max_len_kv_cpu = max_len_kv.copy_to(paddle::CPUPlace(), false);
if (max_enc_len_this_time > 0) {
const uint32_t max_tile_size_per_bs_kv = div_up(max_enc_dec_len_this_time, block_size);
kv_batch_ids = GetEmptyTensor({bsz * max_tile_size_per_bs_kv},
paddle::DataType::INT32,
seq_lens_encoder.place());
kv_tile_ids_per_batch = GetEmptyTensor({bsz * max_tile_size_per_bs_kv},
paddle::DataType::INT32,
seq_lens_encoder.place());
const uint32_t max_tile_size_per_bs_kv =
div_up(max_enc_dec_len_this_time, block_size);
kv_batch_ids =
GetEmptyTensor({bsz * max_tile_size_per_bs_kv}, paddle::DataType::INT32,
seq_lens_encoder.place());
kv_tile_ids_per_batch =
GetEmptyTensor({bsz * max_tile_size_per_bs_kv}, paddle::DataType::INT32,
seq_lens_encoder.place());
auto kv_num_blocks_x =
GetEmptyTensor({1}, paddle::DataType::INT32, seq_lens_encoder.place());
split_kv_block<<<1, 32, 0, seq_lens_encoder.stream()>>>(
seq_lens_decoder.data<int>(),
// sequence_lengths->data<int>(),
seq_lens_encoder.data<int>(),
kv_batch_ids.data<int>(),
kv_tile_ids_per_batch.data<int>(),
kv_num_blocks_x.data<int>(),
bsz,
block_size,
block_size
);
seq_lens_decoder.data<int>(),
// sequence_lengths->data<int>(),
seq_lens_encoder.data<int>(), kv_batch_ids.data<int>(),
kv_tile_ids_per_batch.data<int>(), kv_num_blocks_x.data<int>(), bsz,
block_size, block_size);
kv_num_blocks_x_cpu = kv_num_blocks_x.copy_to(paddle::CPUPlace(), false);
const uint32_t encoder_max_tile_size_per_bs_q = div_up(
(max_enc_dec_len_this_time * group_size), encoder_block_shape_q);
const uint32_t encoder_max_tile_size_per_bs_q =
div_up((max_enc_dec_len_this_time * group_size), encoder_block_shape_q);
encoder_batch_ids =
GetEmptyTensor({bsz * encoder_max_tile_size_per_bs_q},
paddle::DataType::INT32,
seq_lens_encoder.place());
paddle::DataType::INT32, seq_lens_encoder.place());
encoder_tile_ids_per_batch =
GetEmptyTensor({bsz * encoder_max_tile_size_per_bs_q},
paddle::DataType::INT32,
seq_lens_encoder.place());
paddle::DataType::INT32, seq_lens_encoder.place());
auto encoder_num_blocks_x =
GetEmptyTensor({1}, paddle::DataType::INT32, seq_lens_encoder.place());
split_q_block<<<1, 32, 0, stream>>>(seq_lens_encoder.data<int>(),
nullptr,
split_q_block<<<1, 32, 0, stream>>>(seq_lens_encoder.data<int>(), nullptr,
encoder_batch_ids.data<int>(),
encoder_tile_ids_per_batch.data<int>(),
encoder_num_blocks_x.data<int>(),
bsz,
encoder_block_shape_q,
group_size);
encoder_num_blocks_x.data<int>(), bsz,
encoder_block_shape_q, group_size);
encoder_num_blocks_x_cpu =
encoder_num_blocks_x.copy_to(paddle::CPUPlace(), false);
} else {
encoder_batch_ids =
GetEmptyTensor({0}, paddle::DataType::INT32, seq_lens_encoder.place());
encoder_tile_ids_per_batch =
GetEmptyTensor({0}, paddle::DataType::INT32, seq_lens_encoder.place());
encoder_num_blocks_x_cpu =
GetEmptyTensor({0}, paddle::DataType::INT32, paddle::CPUPlace());
kv_batch_ids =
GetEmptyTensor({0}, paddle::DataType::INT32, seq_lens_encoder.place());
kv_tile_ids_per_batch =
GetEmptyTensor({0}, paddle::DataType::INT32, seq_lens_encoder.place());
kv_num_blocks_x_cpu =
GetEmptyTensor({0}, paddle::DataType::INT32, seq_lens_encoder.place());
}
if (max_just_dec_len_this_time > 0) {
const uint32_t decoder_max_tile_size_per_bs_q =
@@ -297,24 +297,26 @@ std::vector<paddle::Tensor> GetBlockShapeAndSplitKVBlock(
decoder_batch_ids =
GetEmptyTensor({bsz * decoder_max_tile_size_per_bs_q},
paddle::DataType::INT32,
seq_lens_encoder.place());
paddle::DataType::INT32, seq_lens_encoder.place());
decoder_tile_ids_per_batch =
GetEmptyTensor({bsz * decoder_max_tile_size_per_bs_q},
paddle::DataType::INT32,
seq_lens_encoder.place());
paddle::DataType::INT32, seq_lens_encoder.place());
auto decoder_num_blocks_x =
GetEmptyTensor({1}, paddle::DataType::INT32, seq_lens_encoder.place());
split_q_block<<<1, 32, 0, stream>>>(seq_lens_this_time.data<int>(),
seq_lens_encoder.data<int>(),
decoder_batch_ids.data<int>(),
decoder_tile_ids_per_batch.data<int>(),
decoder_num_blocks_x.data<int>(),
bsz,
decoder_block_shape_q,
group_size);
split_q_block<<<1, 32, 0, stream>>>(
seq_lens_this_time.data<int>(), seq_lens_encoder.data<int>(),
decoder_batch_ids.data<int>(), decoder_tile_ids_per_batch.data<int>(),
decoder_num_blocks_x.data<int>(), bsz, decoder_block_shape_q,
group_size);
decoder_num_blocks_x_cpu =
decoder_num_blocks_x.copy_to(paddle::CPUPlace(), false);
} else {
decoder_batch_ids =
GetEmptyTensor({0}, paddle::DataType::INT32, seq_lens_encoder.place());
decoder_tile_ids_per_batch =
GetEmptyTensor({0}, paddle::DataType::INT32, seq_lens_encoder.place());
decoder_num_blocks_x_cpu =
GetEmptyTensor({0}, paddle::DataType::INT32, paddle::CPUPlace());
}
return {encoder_batch_ids,
@@ -331,28 +333,22 @@ std::vector<paddle::Tensor> GetBlockShapeAndSplitKVBlock(
}
std::vector<paddle::DataType> GetBlockShapeAndSplitKVBlockInferDtype(
const paddle::DataType& seq_lens_encoder_dtype,
const paddle::DataType& seq_lens_decoder_dtype,
const paddle::DataType& seq_lens_this_time_dtype,
const paddle::DataType& cum_offsets_dtype) {
return {paddle::DataType::INT32,
paddle::DataType::INT32,
paddle::DataType::INT32,
paddle::DataType::INT32,
paddle::DataType::INT32,
paddle::DataType::INT32,
paddle::DataType::INT32,
paddle::DataType::INT32,
paddle::DataType::INT32,
paddle::DataType::INT32,
paddle::DataType::INT32};
const paddle::DataType &seq_lens_encoder_dtype,
const paddle::DataType &seq_lens_decoder_dtype,
const paddle::DataType &seq_lens_this_time_dtype,
const paddle::DataType &cum_offsets_dtype) {
return {
paddle::DataType::INT32, paddle::DataType::INT32, paddle::DataType::INT32,
paddle::DataType::INT32, paddle::DataType::INT32, paddle::DataType::INT32,
paddle::DataType::INT32, paddle::DataType::INT32, paddle::DataType::INT32,
paddle::DataType::INT32, paddle::DataType::INT32};
}
std::vector<std::vector<int64_t>> GetBlockShapeAndSplitKVBlockInferShape(
const std::vector<int64_t>& seq_lens_encoder_shape,
const std::vector<int64_t>& seq_lens_decoder_shape,
const std::vector<int64_t>& seq_lens_this_time_shape,
const std::vector<int64_t>& cum_offsets_shape) {
const std::vector<int64_t> &seq_lens_encoder_shape,
const std::vector<int64_t> &seq_lens_decoder_shape,
const std::vector<int64_t> &seq_lens_this_time_shape,
const std::vector<int64_t> &cum_offsets_shape) {
std::vector<int64_t> dynamic_shape = {-1};
return {dynamic_shape,
@@ -369,9 +365,7 @@ std::vector<std::vector<int64_t>> GetBlockShapeAndSplitKVBlockInferShape(
}
PD_BUILD_STATIC_OP(get_block_shape_and_split_kv_block)
.Inputs({"seq_lens_encoder",
"seq_lens_decoder",
"seq_lens_this_time",
.Inputs({"seq_lens_encoder", "seq_lens_decoder", "seq_lens_this_time",
"cum_offsets"})
.Outputs({paddle::Optional("encoder_batch_ids"),
paddle::Optional("encoder_tile_ids_per_batch"),
@@ -382,12 +376,9 @@ PD_BUILD_STATIC_OP(get_block_shape_and_split_kv_block)
paddle::Optional("decoder_batch_ids"),
paddle::Optional("decoder_tile_ids_per_batch"),
paddle::Optional("decoder_num_blocks"),
paddle::Optional("max_len_kv"),
"set_max_lengths"})
.Attrs({"encoder_block_shape_q: int",
"decoder_block_shape_q: int",
"group_size: int",
"block_size: int",
paddle::Optional("max_len_kv"), "set_max_lengths"})
.Attrs({"encoder_block_shape_q: int", "decoder_block_shape_q: int",
"group_size: int", "block_size: int",
"decoder_step_token_num: int"})
.SetKernelFn(PD_KERNEL(GetBlockShapeAndSplitKVBlock))
.SetInferShapeFn(PD_INFER_SHAPE(GetBlockShapeAndSplitKVBlockInferShape))
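The tiling logic above is driven by div_up: split_q_block emits div_up(seq_len * group_size, num_rows_per_block) (batch, tile) pairs per batch element, while the host sizes the batch_ids / tile_ids buffers as bsz times the per-batch worst case, an upper bound on that count. A small host-side sketch of the same arithmetic follows; the function names are illustrative, not FastDeploy APIs.
#include <vector>
// Ceiling division, matching the div_up used by the kernels above.
constexpr int div_up_host(int a, int b) { return (a + b - 1) / b; }
// Exact number of (batch, tile) pairs split_q_block would write for these
// sequence lengths; the device buffers above are allocated with
// bsz * max-per-batch, which bounds this value from above.
int total_q_tiles(const std::vector<int> &seq_lens, int group_size,
                  int block_shape_q) {
    int total = 0;
    for (int seq_len : seq_lens) {
        total += div_up_host(seq_len * group_size, block_shape_q);
    }
    return total;
}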


@@ -337,6 +337,8 @@ __forceinline__ __host__ __device__ void vec_cast<nv_bfloat16, float>(
} else if (deal_each_time == 64) { \
constexpr size_t DEAL_EACH_TIME = 64; \
__VA_ARGS__ \
} else { \
PD_THROW("not support the deal_each_time", deal_each_time); \
}
#define DISPATCH_NUM_THREADS(num_threads, NUM_THREADS, ...) \
@@ -346,6 +348,8 @@ __forceinline__ __host__ __device__ void vec_cast<nv_bfloat16, float>(
} else if (num_threads == 256) { \
constexpr size_t NUM_THREADS = 256; \
__VA_ARGS__ \
} else { \
PD_THROW("not support the num_threads", num_threads); \
}
#define DISPATCH_GQA_GROUP_SIZE(group_size, GROUP_SIZE, ...) \
@@ -376,6 +380,11 @@ __forceinline__ __host__ __device__ void vec_cast<nv_bfloat16, float>(
} else if (group_size == 12) { \
constexpr size_t GROUP_SIZE = 12; \
__VA_ARGS__ \
} else if (group_size == 16) { \
constexpr size_t GROUP_SIZE = 16; \
__VA_ARGS__ \
} else { \
PD_THROW("not support the group_size", group_size); \
}
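These DISPATCH_* macros map a runtime value onto a compile-time constant by enumerating the supported cases and throwing on anything else; the hunks above simply add new cases (e.g. group_size == 16) plus the fallback branch. A stripped-down sketch of the pattern is shown below; the macro name, the supported values, and run_kernel_stub are illustrative.
#include <cstddef>
#include <cstdio>
// Turn a runtime group_size into a constexpr GROUP_SIZE visible to __VA_ARGS__.
#define DISPATCH_EXAMPLE_GROUP_SIZE(group_size, GROUP_SIZE, ...) \
    if (group_size == 8) {                                       \
        constexpr size_t GROUP_SIZE = 8;                         \
        __VA_ARGS__                                              \
    } else if (group_size == 16) {                               \
        constexpr size_t GROUP_SIZE = 16;                        \
        __VA_ARGS__                                              \
    } else {                                                     \
        std::printf("unsupported group_size: %d\n", group_size); \
    }
template <size_t GroupSize>
void run_kernel_stub() { std::printf("GROUP_SIZE = %zu\n", GroupSize); }
void dispatch_example(int group_size) {
    DISPATCH_EXAMPLE_GROUP_SIZE(group_size, GROUP_SIZE,
                                { run_kernel_stub<GROUP_SIZE>(); })
}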
#define DISPATCH_BLOCKSHAPE_Q(block_shape_q, BLOCK_SHAPE_Q, NUM_WARP_Q, ...) \


@@ -13,7 +13,7 @@
// limitations under the License.
#include "paddle/extension.h"
#include "pybind11/pybind11.h"
namespace py = pybind11;
// Custom exception class for handling CUDA errors
@@ -125,45 +125,40 @@ paddle::Tensor FusedExpertMoeFunc(
const bool norm_topk_prob, const bool group_moe);
std::vector<paddle::Tensor> MoeExpertDispatch(
const paddle::Tensor& input,
const paddle::Tensor& gating_output,
const paddle::optional<paddle::Tensor>& gating_correction_bias,
const paddle::optional<paddle::Tensor> &w4a8_in_scale,
const int moe_topk,
const bool group_moe,
const bool topk_only_mode);
const paddle::Tensor &input, const paddle::Tensor &gating_output,
const paddle::optional<paddle::Tensor> &gating_correction_bias,
const paddle::optional<paddle::Tensor> &w4a8_in_scale, const int moe_topk,
const bool group_moe, const bool topk_only_mode);
std::vector<paddle::Tensor>
MoETopKSelectKernel(const paddle::Tensor &gating_logits,
const paddle::optional<paddle::Tensor> &bias,
const int moe_topk, const bool apply_norm_weight,
const bool enable_softmax_top_k_fused);
const paddle::optional<paddle::Tensor> &bias,
const int moe_topk, const bool apply_norm_weight,
const bool enable_softmax_top_k_fused);
std::vector<paddle::Tensor> MoERedundantTopKSelectKernel(
const paddle::Tensor& gating_logits,
const paddle::Tensor& expert_id_to_ep_rank_array,
const paddle::Tensor& expert_in_rank_num_list,
paddle::Tensor& tokens_per_expert_stats_list,
const paddle::optional<paddle::Tensor>& bias,
const int moe_topk,
const bool apply_norm_weight,
const bool enable_softmax_top_k_fused,
const int redundant_ep_rank_num_plus_one);
std::vector<paddle::Tensor>
MoERedundantTopKSelectKernel(const paddle::Tensor &gating_logits,
const paddle::Tensor &expert_id_to_ep_rank_array,
const paddle::Tensor &expert_in_rank_num_list,
paddle::Tensor &tokens_per_expert_stats_list,
const paddle::optional<paddle::Tensor> &bias,
const int moe_topk, const bool apply_norm_weight,
const bool enable_softmax_top_k_fused,
const int redundant_ep_rank_num_plus_one);
std::vector<paddle::Tensor>
EPMoeExpertDispatch(const paddle::Tensor &input, const paddle::Tensor &topk_ids,
const paddle::Tensor &topk_weights,
const paddle::optional<paddle::Tensor> &ffn1_in_scale,
const std::vector<int> &token_nums_per_expert,
const int token_nums_this_rank,
const std::string &moe_quant_type);
const paddle::Tensor &topk_weights,
const paddle::optional<paddle::Tensor> &ffn1_in_scale,
const std::vector<int> &token_nums_per_expert,
const int token_nums_this_rank,
const std::string &moe_quant_type);
std::vector<paddle::Tensor> EPMoeExpertDispatchFP8(
const paddle::Tensor &input, const paddle::Tensor &scale,
const paddle::Tensor &topk_ids, const paddle::Tensor &topk_weights,
const std::vector<int> &token_nums_per_expert,
const std::vector<int> &token_nums_per_expert_padded,
const int token_nums_this_rank, const int token_nums_this_rank_padded);
const paddle::Tensor &token_nums_per_expert,
const paddle::Tensor &token_nums_per_expert_padded);
std::vector<paddle::Tensor> PerTokenQuant(paddle::Tensor &input,
const int block_size);
@@ -180,20 +175,35 @@ std::vector<paddle::Tensor> EPMoeExpertCombine(
const paddle::optional<paddle::Tensor> &ffn2_bias,
const bool norm_topk_prob, const float routed_scaling_factor);
std::vector<std::vector<int>> GetExpertTokenNum(
const paddle::Tensor& topk_ids,
const int num_experts);
std::vector<std::vector<int>> GetExpertTokenNum(const paddle::Tensor &topk_ids,
const int num_experts);
paddle::Tensor MoeExpertFFNFunc(
const paddle::Tensor &permute_input,
const paddle::Tensor &tokens_expert_prefix_sum,
const paddle::Tensor &ffn1_weight, const paddle::Tensor &ffn2_weight,
const paddle::optional<paddle::Tensor> &ffn1_bias,
const paddle::optional<paddle::Tensor> &ffn1_scale,
const paddle::optional<paddle::Tensor> &ffn2_scale,
const paddle::optional<paddle::Tensor> &ffn2_in_scale,
const paddle::optional<paddle::Tensor> &expert_idx_per_token,
const std::string &quant_method, const bool used_in_ep_low_latency);
const paddle::Tensor& permute_input,
const paddle::Tensor& tokens_expert_prefix_sum,
const paddle::Tensor& ffn1_weight, const paddle::Tensor& ffn2_weight,
const paddle::optional<paddle::Tensor>& ffn1_bias,
const paddle::optional<paddle::Tensor>& ffn1_scale,
const paddle::optional<paddle::Tensor>& ffn2_scale,
const paddle::optional<paddle::Tensor>& ffn2_in_scale,
const paddle::optional<paddle::Tensor>& expert_idx_per_token,
const std::string& quant_method, const bool used_in_ep_low_latency);
paddle::Tensor MoeExpertFFNWint2Func(
const paddle::Tensor& permute_input,
const paddle::Tensor& tokens_expert_prefix_sum,
const paddle::Tensor& ffn1_weight,
const paddle::Tensor& ffn2_weight,
const paddle::optional<paddle::Tensor>& ffn1_bias,
const paddle::optional<paddle::Tensor>& ffn1_scale,
const paddle::optional<paddle::Tensor>& ffn2_scale,
const paddle::optional<paddle::Tensor>& ffn1_local_scale,
const paddle::optional<paddle::Tensor>& ffn1_code_scale,
const paddle::optional<paddle::Tensor>& ffn1_code_zp,
const paddle::optional<paddle::Tensor>& ffn2_local_scale,
const paddle::optional<paddle::Tensor>& ffn2_code_scale,
const paddle::optional<paddle::Tensor>& ffn2_code_zp,
const bool used_in_ep_low_latency);
paddle::Tensor MoeExpertReduceFunc(
const paddle::Tensor &ffn_out, const paddle::Tensor &top_k_weight,
@@ -205,19 +215,16 @@ paddle::Tensor MoeExpertReduceFunc(
void InitKVSignalPerQuery(const paddle::Tensor &seq_lens_encoder_tensor,
const paddle::Tensor &seq_lens_this_time_tensor,
const paddle::Tensor &seq_lens_decoder_tensor,
const int rank,
const int num_layers);
const int rank, const int num_layers);
void GetOutputKVSignal(const paddle::Tensor& x,
int64_t rank_id,
void GetOutputKVSignal(const paddle::Tensor &x, int64_t rank_id,
bool wait_flag);
paddle::Tensor DequantInt8Func(const paddle::Tensor &input,
const paddle::Tensor &out_scale,
std::string dtype);
paddle::Tensor OpenShmAndGetMetaSignalFunc(const int rank,
paddle::Tensor OpenShmAndGetMetaSignalFunc(const int rank, const int device_id,
const bool keep_pd_step_flag);
paddle::Tensor InitSignalLayerwiseFunc(const paddle::Tensor &kv_signal_metadata,
@@ -286,61 +293,121 @@ std::vector<paddle::Tensor> ExtractTextTokenOutput(
const paddle::Tensor &seq_lens_this_time,
const paddle::Tensor &cu_seqlens_q, const paddle::Tensor &score_text);
std::vector<paddle::Tensor> MoEDeepGEMMPermute(
const paddle::Tensor& x,
const paddle::Tensor& topk_idx,
const int num_experts,
const int max_tokens_per_expert
);
std::vector<paddle::Tensor> MoEDeepGEMMPermute(const paddle::Tensor &x,
const paddle::Tensor &topk_idx,
const int num_experts,
const int max_tokens_per_expert);
std::vector<paddle::Tensor> MoEDeepGEMMDePermute(
const paddle::Tensor& ffn_out, // [num_experts, max_tokens_per_expert, hidden]
const paddle::Tensor& permute_indices_per_token, // [token_num, topk]
const paddle::Tensor& topk_idx,
const paddle::Tensor& topk_weights
);
const paddle::Tensor
&ffn_out, // [num_experts, max_tokens_per_expert, hidden]
const paddle::Tensor &permute_indices_per_token, // [token_num, topk]
const paddle::Tensor &topk_idx, const paddle::Tensor &topk_weights);
void TextImageIndexOut(const paddle::Tensor &token_type_ids,
const paddle::Tensor &text_input,
const paddle::Tensor &image_input);
void TextImageGatherScatter(paddle::Tensor &input, paddle::Tensor &text_input,
paddle::Tensor &image_input,
paddle::Tensor &token_type_ids,
paddle::Tensor &text_index,
paddle::Tensor &image_index, const bool is_scatter);
paddle::Tensor count_tokens_per_expert_func(const paddle::Tensor &topk_ids,
int64_t num_experts);
std::vector<paddle::Tensor> tritonmoe_preprocess_kernel(const paddle::Tensor& topk_ids, int64_t num_experts, int64_t GEMM_BLOCK_SIZE_M);
std::vector<paddle::Tensor> MoeWna16MarlinGemmApi(
const paddle::Tensor& a,
const paddle::optional<paddle::Tensor>& c_or_none,
const paddle::Tensor& b_q_weight,
const paddle::Tensor& b_scales,
const paddle::optional<paddle::Tensor>& global_scale_or_none,
const paddle::optional<paddle::Tensor>& b_zeros_or_none,
const paddle::optional<paddle::Tensor>& g_idx_or_none,
const paddle::optional<paddle::Tensor>& perm_or_none,
const paddle::Tensor& workspace,
const paddle::Tensor& sorted_token_ids,
const paddle::Tensor& expert_ids,
const paddle::Tensor& num_tokens_post_padded,
const paddle::Tensor& topk_weights,
int64_t moe_block_size,
int64_t top_k,
bool mul_topk_weights,
bool is_ep,
const std::string& b_q_type_str,
int64_t size_m,
int64_t size_n,
int64_t size_k,
bool is_k_full,
bool use_atomic_add,
bool use_fp32_reduce,
bool is_zp_float);
void CutlassScaledMm(paddle::Tensor &c, paddle::Tensor const &a,
paddle::Tensor const &b, paddle::Tensor const &a_scales,
paddle::Tensor const &b_scales,
paddle::optional<paddle::Tensor> const &bias);
void CutlassScaledMmAzp(paddle::Tensor& c, paddle::Tensor const& a,
paddle::Tensor const& b,
paddle::Tensor const& a_scales,
paddle::Tensor const& b_scales,
paddle::Tensor const& azp_adj,
paddle::optional<paddle::Tensor> const& azp,
paddle::optional<paddle::Tensor> const& bias);
void StaticScaledFp8Quant(paddle::Tensor &out, paddle::Tensor const &input,
paddle::Tensor const &scale);
void DynamicScaledFp8Quant(paddle::Tensor &out, paddle::Tensor const &input,
paddle::Tensor &scale);
void DynamicPerTokenScaledFp8Quant(paddle::Tensor &out,
paddle::Tensor const &input,
paddle::Tensor &scales, float scale_ub);
PYBIND11_MODULE(fastdeploy_ops, m) {
m.def("get_expert_token_num", &GetExpertTokenNum,
py::arg("topk_ids"), py::arg("num_experts"),
"get expert token num");
m.def("get_expert_token_num", &GetExpertTokenNum, py::arg("topk_ids"),
py::arg("num_experts"), "get expert token num");
/**
* moe/fused_moe/moe_redundant_topk_select.cu
* moe_redundant_topk_select
*/
m.def("f_moe_redundant_topk_select", &MoERedundantTopKSelectKernel,
py::arg("gating_logits"), py::arg("expert_id_to_ep_rank_array"),
py::arg("expert_in_rank_num_list"),
py::arg("tokens_per_expert_stats_list"), py::arg("bias"),
py::arg("moe_topk"), py::arg("apply_norm_weight"),
py::arg("enable_softmax_top_k_fused"),
py::arg("redundant_ep_rank_num_plus_one"),
"moe export RedundantTopKSelect function");
/**
* moe/fused_moe/moe_redundant_topk_select.cu
* moe_redundant_topk_select
*/
m.def("f_moe_redundant_topk_select", &MoERedundantTopKSelectKernel,
py::arg("gating_logits"), py::arg("expert_id_to_ep_rank_array"),
py::arg("expert_in_rank_num_list"), py::arg("tokens_per_expert_stats_list"),
py::arg("bias"), py::arg("moe_topk"), py::arg("apply_norm_weight"),
py::arg("enable_softmax_top_k_fused"), py::arg("redundant_ep_rank_num_plus_one"),
"moe export RedundantTopKSelect function");
/**
* open_shm_and_get_meta_signal.cc
* InitKVSignalPerQuery
*/
m.def("init_kv_signal_per_query", &InitKVSignalPerQuery,
py::arg("seq_lens_encoder_tensor"),
py::arg("seq_lens_this_time_tensor"),
py::arg("seq_lens_decoder_tensor"), py::arg("rank"),
py::arg("num_layers"), "init_kv_signal_per_query function");
/**
* GetOutputKVSignal
*/
m.def("get_output_kv_signal", &GetOutputKVSignal, py::arg("x"),
py::arg("rank_id"), py::arg("wait_flag"),
"get_output_kv_signal function");
/**
* open_shm_and_get_meta_signal.cc
* InitKVSignalPerQuery
*/
m.def("init_kv_signal_per_query", &InitKVSignalPerQuery,
py::arg("seq_lens_encoder_tensor"), py::arg("seq_lens_this_time_tensor"),
py::arg("seq_lens_decoder_tensor"), py::arg("rank"), py::arg("num_layers"),
"init_kv_signal_per_query function");
/**
* GetOutputKVSignal
*/
m.def("get_output_kv_signal", &GetOutputKVSignal,
py::arg("x"), py::arg("rank_id"), py::arg("wait_flag"),
"get_output_kv_signal function");
m.def("moe_deepgemm_permute", &MoEDeepGEMMPermute, "MoEDeepGEMMPermute");
m.def("moe_deepgemm_depermute", &MoEDeepGEMMDePermute, "MoEDeepGEMMDePermute");
m.def("moe_deepgemm_permute", &MoEDeepGEMMPermute, "MoEDeepGEMMPermute");
m.def("moe_deepgemm_depermute", &MoEDeepGEMMDePermute,
"MoEDeepGEMMDePermute");
/**
* alloc_cache_pinned.cc
* cuda_host_alloc
@@ -398,12 +465,7 @@ PYBIND11_MODULE(fastdeploy_ops, m) {
py::arg("token_nums_per_expert"), py::arg("token_nums_this_rank"),
py::arg("moe_quant_type"), "ep moe export dispatch function");
m.def("ep_moe_expert_dispatch_fp8", &EPMoeExpertDispatchFP8, py::arg("input"),
py::arg("scale"), py::arg("topk_ids"), py::arg("topk_weights"),
py::arg("token_nums_per_expert"),
py::arg("token_nums_per_expert_padded"),
py::arg("token_nums_this_rank"), py::arg("token_nums_this_rank_padded"),
"ep moe export dispatch function");
m.def("ep_moe_expert_dispatch_fp8", &EPMoeExpertDispatchFP8);
m.def("ep_moe_expert_combine", &EPMoeExpertCombine, py::arg("ffn_out"),
py::arg("expert_scales_float"), py::arg("permute_indices_per_token"),
@@ -437,6 +499,12 @@ PYBIND11_MODULE(fastdeploy_ops, m) {
*/
m.def("moe_expert_ffn", &MoeExpertFFNFunc, "moe export ffn function");
/**
* moe/fused_moe/moe_ffn_wint2.cu
* moe_expert_ffn_wint2
*/
m.def("moe_expert_ffn_wint2", &MoeExpertFFNWint2Func, "moe export ffn wint2 function");
/**
* moe/fused_moe/moe_expert_reduce.cu
* moe_expert_reduce
@@ -523,4 +591,66 @@ PYBIND11_MODULE(fastdeploy_ops, m) {
m.def("group_swiglu_with_masked", &GroupSwigluWithMasked,
"group_swiglu_with_masked function");
m.def("text_image_index_out", &TextImageIndexOut,
"text_image_index_out function");
m.def("text_image_gather_scatter", &TextImageGatherScatter,
"text_image_gather_scatter function");
m.def("count_tokens_per_expert_func", &count_tokens_per_expert_func);
m.def("tritonmoe_preprocess_func", &tritonmoe_preprocess_kernel);
m.def("MoeWna16MarlinGemmApi", &MoeWna16MarlinGemmApi,
py::arg("a"),
py::arg("c_or_none"),
py::arg("b_q_weight"),
py::arg("b_scales"),
py::arg("global_scale_or_none"),
py::arg("b_zeros_or_none"),
py::arg("g_idx_or_none"),
py::arg("perm_or_none"),
py::arg("workspace"),
py::arg("sorted_token_ids"),
py::arg("expert_ids"),
py::arg("num_tokens_post_padded"),
py::arg("topk_weights"),
py::arg("moe_block_size"),
py::arg("top_k"),
py::arg("mul_topk_weights"),
py::arg("is_ep"),
py::arg("b_q_type_str"),
py::arg("size_m"),
py::arg("size_n"),
py::arg("size_k"),
py::arg("is_k_full"),
py::arg("use_atomic_add"),
py::arg("use_fp32_reduce"),
py::arg("is_zp_float"));
/**
* cutlass_scaled_mm.cu
* cutlass_scaled_mm
* cutlass_scaled_mm_azp
*/
m.def("cutlass_scaled_mm", &CutlassScaledMm, "cutlass_scaled_mm function");
m.def("cutlass_scaled_mm_azp", &CutlassScaledMmAzp, "cutlass_scaled_mm_azp function");
/**
* quantization/common.cu
* static_scaled_fp8_quant
* dynamic_scaled_fp8_quant
* dynamic_per_token_scaled_fp8_quant
*/
m.def("static_scaled_fp8_quant", &StaticScaledFp8Quant, "static_scaled_fp8_quant function",
py::arg("out"), py::arg("input"), py::arg("scale"));
m.def("dynamic_scaled_fp8_quant", &DynamicScaledFp8Quant,
"dynamic_scaled_fp8_quant function",
py::arg("out"), py::arg("input"), py::arg("scale"));
m.def("dynamic_per_token_scaled_fp8_quant", &DynamicPerTokenScaledFp8Quant,
"dynamic_per_token_scaled_fp8_quant function",
py::arg("out"), py::arg("input"), py::arg("scales"), py::arg("scale_ub"));
}
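The bindings above follow the standard pybind11 pattern: a single PYBIND11_MODULE block in which each C++ entry point is exposed through m.def, optionally with named py::arg parameters and a docstring. A minimal self-contained sketch of that pattern is given below; the module name example_ops and the function add_ints are illustrative, not part of the real fastdeploy_ops surface.
#include "pybind11/pybind11.h"
namespace py = pybind11;
// Toy entry point illustrating the m.def / py::arg pattern used above.
int add_ints(int a, int b) { return a + b; }
PYBIND11_MODULE(example_ops, m) {
    m.def("add_ints", &add_ints, py::arg("a"), py::arg("b"),
          "add two integers");
}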


@@ -0,0 +1,250 @@
/***************************************************************************************************
* Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief Architecture-specific operators on memory added for SM80
*/
#pragma once
#include "cutlass/cutlass.h"
#include "cutlass/complex.h"
#include "cutlass/arch/memory.h"
#include "cutlass/arch/memory_sm75.h"
#include "cutlass/arch/memory_sm80.h"
#include "cutlass/arch/cache_operation.h"
namespace cutlass {
namespace arch {
////////////////////////////////////////////////////////////////////////////////////////////////////
/// Initiates an asynchronous copy from global memory to shared memory.
///
/// cp.async
///
template <
/// Size of the access in bytes
int SizeInBytes,
/// Cache operation
CacheOperation::Kind cache_op = CacheOperation::Always,
bool GlobalToShared = true>
struct copy;
/// Initiates an asynchronous copy from global memory to shared memory. Rather than predicate
/// the entire transfer, zeros are written to SMEM if the guard predicate is false.
///
/// cp.async
///
template <
/// Size of the access in bytes
int SizeInBytes,
/// Cache operation
CacheOperation::Kind cache_op = CacheOperation::Always,
bool GlobalToShared = true>
struct copy_zfill;
/// Blocks until all but <N> previous cp.async.commit_group operations have committed.
///
/// cp.async
///
template <int N, bool GlobalToShared = true>
struct copy_wait;
////////////////////////////////////////////////////////////////////////////////////////////////////
/// Partial specialization
template <
/// Size of the access in bytes
int SizeInBytes>
struct copy<SizeInBytes, CacheOperation::Always, true> {
/// Copy
CUTLASS_DEVICE
copy(void *smem_ptr, void const *global_ptr, bool pred_guard = true) {
cp_async<SizeInBytes, CacheOperation::Always>(smem_ptr, global_ptr, pred_guard);
}
};
/// Partial specialization
template <
/// Size of the access in bytes
int SizeInBytes>
struct copy<SizeInBytes, CacheOperation::Always, false> {
/// Copy
CUTLASS_DEVICE
copy(void *smem_ptr, void const *global_ptr, bool pred_guard = true) {
using AccessType = Array<uint8_t, SizeInBytes>;
if (pred_guard) {
*static_cast<AccessType *>(smem_ptr) = *static_cast<AccessType const *>(global_ptr);
}
}
};
/// Partial specialization
template <
/// Size of the access in bytes
int SizeInBytes>
struct copy_zfill<SizeInBytes, CacheOperation::Always, true> {
/// Copy with zero fill
CUTLASS_DEVICE
copy_zfill(void *smem_ptr, void const *global_ptr, bool pred_guard) {
cp_async_zfill<SizeInBytes, CacheOperation::Always>(smem_ptr, global_ptr, pred_guard);
}
};
/// Partial specialization
template <
/// Size of the access in bytes
int SizeInBytes>
struct copy_zfill<SizeInBytes, CacheOperation::Always, false> {
/// Copy with zero fill
CUTLASS_DEVICE
copy_zfill(void *smem_ptr, void const *global_ptr, bool pred_guard) {
using AccessType = Array<uint8_t, SizeInBytes>;
if (pred_guard) {
*static_cast<AccessType *>(smem_ptr) = *static_cast<AccessType const *>(global_ptr);
}
else {
AccessType zeros;
zeros.clear();
*static_cast<AccessType *>(smem_ptr) = zeros;
}
}
};
////////////////////////////////////////////////////////////////////////////////////////////////////
/// Partial specialization
template <
/// Size of the access in bytes
int SizeInBytes>
struct copy<SizeInBytes, CacheOperation::Global, true> {
/// Copy
CUTLASS_DEVICE
copy(void *smem_ptr, void const *global_ptr, bool pred_guard = true) {
cp_async<SizeInBytes, CacheOperation::Global>(smem_ptr, global_ptr, pred_guard);
}
};
/// Partial specialization
template <
/// Size of the access in bytes
int SizeInBytes>
struct copy<SizeInBytes, CacheOperation::Global, false> {
/// Copy
CUTLASS_DEVICE
copy(void *smem_ptr, void const *global_ptr, bool pred_guard = true) {
using AccessType = Array<uint8_t, SizeInBytes>;
if (pred_guard) {
*static_cast<AccessType *>(smem_ptr) = *static_cast<AccessType const *>(global_ptr);
}
}
};
/// Partial specialization
template <
/// Size of the access in bytes
int SizeInBytes>
struct copy_zfill<SizeInBytes, CacheOperation::Global, true> {
/// Copy with zero fill
CUTLASS_DEVICE
copy_zfill(void *smem_ptr, void const *global_ptr, bool pred_guard = true) {
cp_async_zfill<SizeInBytes, CacheOperation::Global>(smem_ptr, global_ptr, pred_guard);
}
};
/// Partial specialization
template <
/// Size of the access in bytes
int SizeInBytes>
struct copy_zfill<SizeInBytes, CacheOperation::Global, false> {
/// Copy with zero fill
CUTLASS_DEVICE
copy_zfill(void *smem_ptr, void const *global_ptr, bool pred_guard = true) {
using AccessType = Array<uint8_t, SizeInBytes>;
if (pred_guard) {
*static_cast<AccessType *>(smem_ptr) = *static_cast<AccessType const *>(global_ptr);
}
else {
AccessType zeros;
zeros.clear();
*static_cast<AccessType *>(smem_ptr) = zeros;
}
}
};
/// Establishes an ordering w.r.t previously issued cp.async instructions. Does not block.
template <bool GlobalToShared>
CUTLASS_DEVICE
void copy_fence() {}
template <>
CUTLASS_DEVICE
void copy_fence<true>() {
cp_async_fence();
}
////////////////////////////////////////////////////////////////////////////////////////////////////
/// Partial specialization
template <int N>
struct copy_wait<N, false> {
CUTLASS_DEVICE
copy_wait() {}
};
/// Partial specialization
template <int N>
struct copy_wait<N, true> {
CUTLASS_DEVICE
copy_wait() { cp_async_wait<N>(); }
};
/////////////////////////////////////////////////////////////////////////////////////////////////
} // namespace arch
} // namespace cutlass
/////////////////////////////////////////////////////////////////////////////////////////////////
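Taken together, these wrappers let SM80 code issue cp.async copies, fence them, and wait for them through one templated interface, with a synchronous fallback whenever GlobalToShared is false. The kernel below is a hedged sketch of how they might be used to stage one 16-byte fragment per thread; the kernel name and shapes are illustrative, and it assumes this header is included (its path is not shown in the diff) and an SM80+ target.
// Assumes the header above is on the include path and the build targets SM80+.
// Launch with at most 128 threads per block.
__global__ void stage_tile(const float4 *global_src, float4 *result) {
    __shared__ float4 smem[128];
    const int tid = threadIdx.x;
    // Issue an asynchronous 16-byte global->shared copy (cp.async on SM80+).
    cutlass::arch::copy<16, cutlass::arch::CacheOperation::Always, true>(
        smem + tid, global_src + tid, /*pred_guard=*/true);
    // Commit the cp.async instructions issued so far ...
    cutlass::arch::copy_fence<true>();
    // ... and block until all committed groups have completed (N = 0 pending).
    cutlass::arch::copy_wait<0, true>();
    __syncthreads();
    result[tid] = smem[tid];
}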

Some files were not shown because too many files have changed in this diff.