Mirror of https://github.com/PaddlePaddle/FastDeploy.git (synced 2025-10-04 16:22:57 +08:00)

Commit: "Sync v2.0 version of code to github repo"
.clang-format (new file, 29 lines)
@@ -0,0 +1,29 @@
# This file is used by clang-format to autoformat Paddle source code.
#
# clang-format is part of the LLVM toolchain.
# LLVM and clang need to be installed to format source code with it.
#
# The basic usage is:
#   clang-format -i -style=file PATH/TO/SOURCE/CODE
#
# -style=file implicitly uses the ".clang-format" file located in one of the
# parent directories.
# -i means in-place change.
#
# The clang-format documentation is at:
#   http://clang.llvm.org/docs/ClangFormat.html
#   http://clang.llvm.org/docs/ClangFormatStyleOptions.html
---
Language: Cpp
BasedOnStyle: Google
IndentWidth: 4
TabWidth: 2
ContinuationIndentWidth: 4
AccessModifierOffset: -1 # private/protected/public get no extra indent inside a class
Standard: Cpp11
AllowAllParametersOfDeclarationOnNextLine: true
BinPackParameters: false
BinPackArguments: false
IncludeBlocks: Preserve
IncludeIsMainSourceRegex: (\.cu)$
...
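A small sketch (not part of the commit) of the invocation the comments above describe, driven from Python; the target file path is a placeholder.

```python
import subprocess

# Format one source file in place; -style=file picks up the .clang-format
# found by walking parent directories, as noted in the comments above.
subprocess.run(
    ["clang-format", "-i", "-style=file", "custom_ops/gpu_ops/example.cu"],
    check=True,
)
```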
.gitignore (vendored, 6 lines changed)
@@ -121,7 +121,7 @@ dmypy.json
 FETCH_HEAD

 #log
-log/
+log*/

 checkpoints/
 checkpoints_origin/
@@ -158,3 +158,7 @@ custom_ops/gpu_ops/fp8_deep_gemm/deep_gemm/include/cute

 # buff
 custom_ops/tmp*
+
+build
+
+.ccls-cache
.pre-commit-config.yaml (inferred from content; file header not captured)
@@ -16,7 +16,7 @@ repos:
     rev: v0.11.7
     hooks:
       - id: ruff
-        args: [--output-format, github, --fix]
+        args: [--output-format, github, --fix, --line-length=120]
   # # spell check
   # - repo: https://github.com/codespell-project/codespell
   #   rev: v2.4.1
@@ -29,14 +29,15 @@ repos:
     rev: 6.0.1
     hooks:
       - id: isort
-  # formatting
-  - repo: https://github.com/pre-commit/mirrors-clang-format
-    rev: v20.1.3
-    hooks:
-      - id: clang-format
-        # exclude: '.*'
-        types_or: [c++, cuda]
-        args: [--style=file, --verbose]
+  # # formatting
+  # - repo: https://github.com/pre-commit/mirrors-clang-format
+  #   rev: v20.1.3
+  #   hooks:
+  #     - id: clang-format
+  #       # exclude: '.*'
+  #       types_or: [c++, cuda]
+  #       args: [--style=file, --verbose]

   # markdown
   - repo: https://github.com/jackdewinter/pymarkdown
     rev: v0.9.29
README.md (156 lines changed)
Previous README.md (removed; translated from Chinese):

# FastDeploy 2.0: Large Model Inference and Deployment

<p align="center">
    <a href="./LICENSE"><img src="https://img.shields.io/badge/license-Apache%202-dfd.svg"></a>
    <a href="https://github.com/PaddlePaddle/FastDeploy/releases"><img src="https://img.shields.io/github/v/release/PaddlePaddle/FastDeploy?color=ffa"></a>
    <a href=""><img src="https://img.shields.io/badge/python-3.10+-aff.svg"></a>
    <a href=""><img src="https://img.shields.io/badge/os-linux-pink.svg"></a>
    <a href="https://github.com/PaddlePaddle/FastDeploy/graphs/contributors"><img src="https://img.shields.io/github/contributors/PaddlePaddle/FastDeploy?color=9ea"></a>
    <a href="https://github.com/PaddlePaddle/FastDeploy/commits"><img src="https://img.shields.io/github/commit-activity/m/PaddlePaddle/FastDeploy?color=3af"></a>
    <a href="https://github.com/PaddlePaddle/FastDeploy/stargazers"><img src="https://img.shields.io/github/stars/PaddlePaddle/FastDeploy?color=ccf"></a>
</p>

FastDeploy 2.0 supports inference for multiple large models (currently only Qwen2; support for more models is coming soon). Its inference and deployment features cover:

- One-command service deployment of a model, with streaming generation
- Tensor-parallel acceleration of model inference
- PagedAttention and continuous batching (dynamic batching)
- OpenAI-compatible HTTP protocol
- Weight-only int8/int4 lossless compression
- Prometheus metrics

> Note: If you are still using FastDeploy to deploy small models (e.g. PaddleClas/PaddleOCR and other CV suite models), please check out the [release/1.1.0 branch](https://github.com/PaddlePaddle/FastDeploy/tree/release/1.1.0).

## Requirements

- A800/H800/H100
- Python >= 3.10
- CUDA >= 12.3
- CUDNN >= 9.5
- Linux x64

## Installation

### Docker (recommended)
```
docker pull ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/fastdeploy:2.0.0.0-alpha
```

### From source

#### Install PaddlePaddle

> Install a nightly build newer than 2025-05-30; see the [PaddlePaddle installation guide](https://www.paddlepaddle.org.cn/en/install/quick?docurl=/documentation/docs/en/develop/install/pip/linux-pip_en.html) and choose the CUDA 12.6 develop (nightly build) version.
```
python -m pip install --pre paddlepaddle-gpu -i https://www.paddlepaddle.org.cn/packages/nightly/cu126/
```

#### Build and install FastDeploy

```
# Build
cd FastDeploy
bash build.sh
# Install
pip install dist/fastdeploy-2.0.0a0-py3-none-any.whl
```

## Quick start

After installation, run the following commands to quickly deploy the Qwen2 model; see the [parameter reference](docs/serving.md) for more options and their meaning.

``` shell
# Download and extract the Qwen model
wget https://fastdeploy.bj.bcebos.com/llm/models/Qwen2-7B-Instruct.tar.gz && tar xvf Qwen2-7B-Instruct.tar.gz
# Deploy on a single GPU
python -m fastdeploy.entrypoints.openai.api_server --model ./Qwen2-7B-Instruct --port 8188 --tensor-parallel-size 1
```

Send a request to the service with:
``` shell
curl -X POST "http://0.0.0.0:8188/v1/chat/completions" \
  -H "Content-Type: application/json" \
  -d '{
    "messages": [
      {"role": "user", "content": "Hello, what is your name?"}
    ]
  }'
```
The response looks like this:
``` json
{
  "id": "chatcmpl-db662f47-7c8c-4945-9a7a-db563b2ddd8d",
  "object": "chat.completion",
  "created": 1749451045,
  "model": "default",
  "choices": [
    {
      "index": 0,
      "message": {
        "role": "assistant",
        "content": "Hello! My name is Qwen (Tongyi Qianwen).",
        "reasoning_content": null
      },
      "finish_reason": "stop"
    }
  ],
  "usage": {
    "prompt_tokens": 25,
    "total_tokens": 35,
    "completion_tokens": 10,
    "prompt_tokens_details": null
  }
}
```
FastDeploy provides a fully OpenAI-compatible service API (the `model` and `api_key` fields are not yet supported and are ignored if set), so the service can also be called through the openai Python API.

## Deployment docs
- [Offline inference](docs/offline_inference.md)
- [Serving](docs/serving.md)
- [Service metrics](docs/metrics.md)

# Code guide
- [Code directory guide](docs/code_guide.md)
- Suggestions and issues about using FastDeploy are welcome via GitHub issues.

# License
FastDeploy follows the [Apache-2.0 open-source license](./LICENSE). To align with the [vLLM](https://github.com/vllm-project/vllm) interface, parts of the vLLM code were referenced and used directly, for which we are grateful.

New README.md (added):

<p align="center">
  <a href="https://github.com/PaddlePaddle/FastDeploy/releases"><img src="https://github.com/user-attachments/assets/42b0039f-39e3-4279-afda-6d1865dfbffb" width="500"></a>
</p>
<p align="center">
    <a href=""><img src="https://img.shields.io/badge/python-3.10-aff.svg"></a>
    <a href=""><img src="https://img.shields.io/badge/os-linux-pink.svg"></a>
    <a href="https://github.com/PaddlePaddle/FastDeploy/graphs/contributors"><img src="https://img.shields.io/github/contributors/PaddlePaddle/FastDeploy?color=9ea"></a>
    <a href="https://github.com/PaddlePaddle/FastDeploy/commits"><img src="https://img.shields.io/github/commit-activity/m/PaddlePaddle/FastDeploy?color=3af"></a>
    <a href="https://github.com/PaddlePaddle/FastDeploy/stargazers"><img src="https://img.shields.io/github/stars/PaddlePaddle/FastDeploy?color=ccf"></a>
</p>

<p align="center">
    <a href="docs/get_started/installation/README.md"><b> Installation </b></a>
    |
    <a href="docs/get_started.md"><b> Quick Start </b></a>
    |
    <a href="docs/supported_models.md"><b> Supported Models </b></a>
</p>

--------------------------------------------------------------------------------
# FastDeploy 2.0: Inference and Deployment Toolkit for LLMs and VLMs based on PaddlePaddle

## News

**[2025-06] 🔥 Released FastDeploy v2.0:** Supports inference and deployment for ERNIE 4.5. Furthermore, we open-source an industrial-grade PD disaggregation with context caching and dynamic role switching for effective resource utilization, further enhancing inference performance for MoE models.

## About

**FastDeploy** is an inference and deployment toolkit for large language models and visual language models based on PaddlePaddle. It delivers **production-ready, out-of-the-box deployment solutions** with core acceleration technologies:

- 🚀 **Load-Balanced PD Disaggregation**: Industrial-grade solution featuring context caching and dynamic instance role switching. Optimizes resource utilization while balancing SLO compliance and throughput.
- 🔄 **Unified KV Cache Transmission**: Lightweight high-performance transport library with intelligent NVLink/RDMA selection.
- 🤝 **OpenAI API Server and vLLM Compatible**: One-command deployment with [vLLM](https://github.com/vllm-project/vllm/) interface compatibility.
- 🧮 **Comprehensive Quantization Format Support**: W8A16, W8A8, W4A16, W4A8, W2A16, FP8, and more.
- ⏩ **Advanced Acceleration Techniques**: Speculative decoding, Multi-Token Prediction (MTP) and Chunked Prefill.
- 🖥️ **Multi-Hardware Support**: NVIDIA GPU, Kunlunxin XPU, Hygon DCU, Ascend NPU, Iluvatar GPU, Enflame GCU, MetaX GPU, etc.

## Requirements

- OS: Linux
- Python: 3.10 ~ 3.12

## Installation

FastDeploy supports inference deployment on **NVIDIA GPUs**, **Kunlunxin XPUs**, **Iluvatar GPUs**, **Enflame GCUs**, and other hardware. For detailed installation instructions:

- [NVIDIA GPU](./docs/installation/nvidia_cuda.md)
- [Kunlunxin XPU](./docs/en/get_started/installation/kunlunxin_xpu.md)
- [Iluvatar GPU](./docs/en/get_started/installation/iluvatar_gpu.md)
- [Enflame GCU](./docs/en/get_started/installation/Enflame_gcu.md)

**Note:** We are actively working on expanding hardware support. Additional hardware platforms including Ascend NPU, Hygon DCU, and MetaX GPU are currently under development and testing. Stay tuned for updates!

## Get Started

Learn how to use FastDeploy through our documentation:
- [10-Minute Quick Deployment](./docs/get_started/quick_start.md)
- [ERNIE-4.5 Large Language Model Deployment](./docs/get_started/ernie-4.5.md)
- [ERNIE-4.5-VL Multimodal Model Deployment](./docs/get_started/ernie-4.5-vl.md)
- [Offline Inference Development](./docs/offline_inference.md)
- [Online Service Deployment](./docs/serving/README.md)
- [Full Supported Models List](./docs/supported_models.md)

## Supported Models

| Model | Data Type | PD Disaggregation | Chunked Prefill | Prefix Caching | MTP | CUDA Graph | Maximum Context Length |
|:--- | :------- | :---------- | :-------- | :-------- | :----- | :----- | :----- |
| ERNIE-4.5-300B-A47B | BF16/WINT4/WINT8/W4A8C8/WINT2/FP8 | ✅ (WINT4/W4A8C8/Expert Parallelism) | ✅ | ✅ | ✅ (WINT4) | WIP | 128K |
| ERNIE-4.5-300B-A47B-Base | BF16/WINT4/WINT8 | ✅ (WINT4/Expert Parallelism) | ✅ | ✅ | ✅ (WINT4) | ❌ | 128K |
| ERNIE-4.5-VL-424B-A47B | BF16/WINT4/WINT8 | WIP | ✅ | WIP | ❌ | WIP | 128K |
| ERNIE-4.5-VL-28B-A3B | BF16/WINT4/WINT8 | ❌ | ✅ | WIP | ❌ | WIP | 128K |
| ERNIE-4.5-21B-A3B | BF16/WINT4/WINT8/FP8 | ❌ | ✅ | ✅ | WIP | ✅ | 128K |
| ERNIE-4.5-21B-A3B-Base | BF16/WINT4/WINT8/FP8 | ❌ | ✅ | ✅ | WIP | ✅ | 128K |
| ERNIE-4.5-0.3B | BF16/WINT8/FP8 | ❌ | ✅ | ✅ | ❌ | ✅ | 128K |

## Advanced Usage

- [Quantization](./docs/quantization/README.md)
- [PD Disaggregation Deployment](./docs/features/pd_disaggregation.md)
- [Speculative Decoding](./docs/features/speculative_decoding.md)
- [Prefix Caching](./docs/features/prefix_caching.md)
- [Chunked Prefill](./docs/features/chunked_prefill.md)

## Acknowledgement

FastDeploy is licensed under the [Apache-2.0 open-source license](./LICENSE). During development, portions of [vLLM](https://github.com/vllm-project/vllm) code were referenced and incorporated to maintain interface compatibility, for which we express our gratitude.
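A minimal sketch (not part of the commit) of calling the OpenAI-compatible server from Python with the `openai` client, as the previous README suggested; it assumes the quick-start server above is listening on port 8188, and the `model`/`api_key` values are placeholders that the server ignores.

```python
from openai import OpenAI

# Point the official OpenAI client at the local FastDeploy server.
client = OpenAI(base_url="http://0.0.0.0:8188/v1", api_key="unused")

response = client.chat.completions.create(
    model="default",
    messages=[{"role": "user", "content": "Hello, what is your name?"}],
)
print(response.choices[0].message.content)
```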
benchmarks/README.md (new file, 106 lines)
@@ -0,0 +1,106 @@
### FastDeploy serving benchmark (load-testing) tool

#### Dataset

Download it locally with wget for performance testing.

<table style="width:100%; border-collapse: collapse;">
<thead>
<tr>
<th style="width:15%; text-align: left;">Dataset</th>
<th style="width:65%; text-align: left;">Data Path</th>
</tr>
</thead>
<tbody>
<tr>
<td><strong>Open-source dataset, 2k samples</strong></td>
<td><code>https://fastdeploy.bj.bcebos.com/eb_query/filtered_sharedgpt_2000_input_1136_output_200_fd.json</code></td>
</tr>
</tbody>
</table>

#### Usage

```
# Install dependencies
python -m pip install -r requirements.txt
```

##### Parameters

```bash
--backend openai-chat: backend interface used for the benchmark; "openai-chat" uses the chat/completions endpoint
--model EB45T: model name; any name works and only affects the name of the saved result file
--endpoint /v1/chat/completions: endpoint, used to build the request URL
--host 0.0.0.0: service IP address, used to build the request URL
--port 9812: service HTTP port, used to build the request URL
--dataset-name EBChat: dataset class; "EBChat" reads a dataset converted to the FD format
--dataset-path ./eb45t_spv4_dataserver_1w_waigua_fd: path to the benchmark dataset
--hyperparameter-path EB45T.yaml: (optional) hyperparameter file; its contents are merged into the request payload, no hyperparameters by default
--percentile-metrics ttft,tpot,itl,e2el,s_ttft,s_itl,s_e2el,s_decode,input_len,s_input_len,output_len: set of metrics reported in the results
--metric-percentiles 80,95,99,99.9,99.95,99.99: percentiles reported for those metrics
--num-prompts 1: total number of requests to send
--max-concurrency 1: benchmark concurrency
--save-result: enable result saving; results are written to a JSON file
```

##### Single-request debugging against /v1/chat/completions

```
python benchmark_serving.py \
  --backend openai-chat \
  --model EB45T \
  --endpoint /v1/chat/completions \
  --host 0.0.0.0 \
  --port 9812 \
  --dataset-name EBChat \
  --dataset-path ./filtered_sharedgpt_2000_input_1136_output_200_fd.json \
  --hyperparameter-path yaml/request_yaml/eb45t-32k.yaml \
  --percentile-metrics ttft,tpot,itl,e2el,s_ttft,s_itl,s_e2el,s_decode,input_len,s_input_len,output_len \
  --metric-percentiles 80,95,99,99.9,99.95,99.99 \
  --num-prompts 1 \
  --max-concurrency 1 \
  --save-result
```

##### Full benchmark against /v1/chat/completions: 100-way concurrency, 2000 requests

```
# Save output to infer_log.txt
python benchmark_serving.py \
  --backend openai-chat \
  --model EB45T \
  --endpoint /v1/chat/completions \
  --host 0.0.0.0 \
  --port 9812 \
  --dataset-name EBChat \
  --dataset-path ./filtered_sharedgpt_2000_input_1136_output_200_fd.json \
  --hyperparameter-path yaml/request_yaml/eb45t-32k.yaml \
  --percentile-metrics ttft,tpot,itl,e2el,s_ttft,s_itl,s_e2el,s_decode,input_len,s_input_len,output_len \
  --metric-percentiles 80,95,99,99.9,99.95,99.99 \
  --num-prompts 2000 \
  --max-concurrency 100 \
  --save-result > infer_log.txt 2>&1 &
```

##### Benchmarking /v1/completions

Set the endpoint to /v1/completions and the backend to openai to benchmark the /v1/completions endpoint.

```
# Save output to infer_log.txt
python benchmark_serving.py \
  --backend openai \
  --model EB45T \
  --endpoint /v1/completions \
  --host 0.0.0.0 \
  --port 9812 \
  --dataset-name EBChat \
  --dataset-path ./filtered_sharedgpt_2000_input_1136_output_200_fd.json \
  --hyperparameter-path yaml/request_yaml/eb45t-32k.yaml \
  --percentile-metrics ttft,tpot,itl,e2el,s_ttft,s_itl,s_e2el,s_decode,input_len,s_input_len,output_len \
  --metric-percentiles 80,95,99,99.9,99.95,99.99 \
  --num-prompts 2000 \
  --max-concurrency 100 \
  --save-result > infer_log.txt 2>&1 &
```
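A short sketch (not part of the repository) of what --hyperparameter-path does: the YAML contents are merged into every request payload, mirroring `payload.update(request_func_input.hyper_parameters)` in backend_request_func.py. The keys mentioned in the comment are assumptions for illustration.

```python
import yaml

with open("yaml/request_yaml/eb45t-32k.yaml") as f:
    hyper_parameters = yaml.safe_load(f) or {}

payload = {
    "model": "default",
    "messages": [{"role": "user", "content": "hello"}],
    "stream": True,
}
# Merge sampling hyperparameters (e.g. temperature, top_p, max_tokens) into the request.
payload.update(hyper_parameters)
```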
benchmarks/backend_request_func.py (new file, 700 lines)
@@ -0,0 +1,700 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""

# This file is modified from https://github.com/vllm-project/vllm/blob/main/benchmarks/backend_request_func.py


import io
import json
import os
import sys
import time
import traceback
from dataclasses import dataclass, field
from typing import Optional

import aiohttp
from tqdm.asyncio import tqdm

AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=6 * 60 * 60)


@dataclass
class RequestFuncInput:
    """Input for requesting LLMs via API"""
    prompt: str
    history_QA: Optional[dict]
    hyper_parameters: dict
    api_url: str
    prompt_len: int
    output_len: int
    model: str
    model_name: Optional[str] = None
    logprobs: Optional[int] = None
    extra_body: Optional[dict] = None
    multi_modal_content: Optional[dict] = None
    ignore_eos: bool = False
    language: Optional[str] = None


@dataclass
class RequestFuncOutput:
    """Output for requesting LLMs via API"""
    generated_text: str = ""
    reasoning_content: str = ""
    success: bool = False
    latency: float = 0.0
    output_tokens: int = 0
    ttft: float = 0.0  # Time to first token
    arrival_time: list = field(default_factory=list)  # per-chunk arrival times
    itl: list = field(default_factory=list)  # list of inter-token latencies
    tpot: float = 0.0  # avg next-token latency
    prompt_len: int = 0
    prompt_tokens: int = 0  # number of input tokens reported by the inference side
    error: str = ""

async def async_request_eb_openai_chat_completions(
    request_func_input: RequestFuncInput,
    pbar: Optional[tqdm] = None,
) -> RequestFuncOutput:
    """Request an LLM using the EB OpenAI chat completions API"""
    api_url = request_func_input.api_url
    assert api_url.endswith(
        ("completions", "profile")
    ), "OpenAI Chat Completions API URL must end with 'completions'."

    async with aiohttp.ClientSession(trust_env=True,
                                     timeout=AIOHTTP_TIMEOUT) as session:
        content = [{"type": "text", "text": request_func_input.prompt}]
        if request_func_input.multi_modal_content:
            content.append(request_func_input.multi_modal_content)
        payload = {
            "model": "default",
            "messages": request_func_input.history_QA,
            "stream": True,
            "stream_options": {
                "include_usage": True,
                "continuous_usage_stats": True
            },
        }
        # Hyperparameters are passed in from the YAML file
        payload.update(request_func_input.hyper_parameters)

        if request_func_input.ignore_eos:
            payload["ignore_eos"] = request_func_input.ignore_eos
        headers = {
            "Content-Type": "application/json",
            "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}",
        }

        output = RequestFuncOutput()
        output.prompt_len = 0

        ttft = 0.0
        st = time.perf_counter()
        most_recent_timestamp = st
        try:
            async with session.post(url=api_url, json=payload,
                                    headers=headers) as response:
                if response.status == 200:
                    async for chunk_bytes in response.content:
                        chunk_bytes = chunk_bytes.strip()
                        if not chunk_bytes:
                            continue

                        chunk = chunk_bytes.decode("utf-8").removeprefix(
                            "data: ")
                        if chunk != "[DONE]":
                            # print("####chunk:", chunk, type(chunk))
                            timestamp = time.perf_counter()
                            data = json.loads(chunk)

                            if choices := data.get("choices"):
                                content = choices[0]["delta"].get("content")
                                reason_content = choices[0]["delta"].get(
                                    "reasoning_content")
                                # First token
                                if ttft == 0.0:
                                    ttft = timestamp - st
                                    output.ttft = ttft
                                    # cached_tokens
                                    output.prompt_len = data["usage"]["prompt_tokens_details"]["cached_tokens"]

                                # Decoding phase
                                else:
                                    output.itl.append(timestamp -
                                                      most_recent_timestamp)

                                output.generated_text += content or ""
                                output.reasoning_content += reason_content or ""
                                output.arrival_time.append(choices[0].get("arrival_time"))
                            elif usage := data.get("usage"):
                                output.output_tokens = usage.get(
                                    "completion_tokens")
                                output.prompt_tokens = usage.get(
                                    "prompt_tokens")

                            most_recent_timestamp = timestamp

                    # output.generated_text = generated_text
                    if output.generated_text.strip() == "":
                        output.success = False
                        output.error = "No generated text found!"
                    else:
                        output.success = True
                        output.latency = most_recent_timestamp - st
                else:
                    error_text = await response.text()
                    print("####error response:", error_text,
                          "####payload:", payload)
                    output.error = error_text or ""
                    output.success = False
        except Exception:
            output.success = False
            exc_info = sys.exc_info()
            output.error = "".join(traceback.format_exception(*exc_info))

    # Save the results of failed requests
    if not output.success:
        with open("error_output.txt", "a") as f:
            f.write(str(output) + "\n")
    if pbar:
        pbar.update(1)
    return output

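# --- Illustrative helper (not part of the original file): how the timing fields
# populated above relate to the commonly reported TPOT metric. The function name
# is hypothetical and shown only as an example.
def tpot_from_output(out: RequestFuncOutput) -> float:
    """Average per-token decode latency, excluding the first token."""
    decode_tokens = max((out.output_tokens or 1) - 1, 1)
    return (out.latency - out.ttft) / decode_tokens
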
async def async_request_eb_openai_completions(
|
||||||
|
request_func_input: RequestFuncInput,
|
||||||
|
pbar: Optional[tqdm] = None,
|
||||||
|
) -> RequestFuncOutput:
|
||||||
|
"""Request an LLM using EB OpenAI"""
|
||||||
|
api_url = request_func_input.api_url
|
||||||
|
assert api_url.endswith(
|
||||||
|
("completions", "profile")
|
||||||
|
), "OpenAI Completions API URL must end with 'completions' or 'profile'."
|
||||||
|
|
||||||
|
async with aiohttp.ClientSession(trust_env=True,
|
||||||
|
timeout=AIOHTTP_TIMEOUT) as session:
|
||||||
|
payload = {
|
||||||
|
"model": "default",
|
||||||
|
"prompt": request_func_input.prompt,
|
||||||
|
"stream": True,
|
||||||
|
"stream_options": {
|
||||||
|
"include_usage": True,
|
||||||
|
"continuous_usage_stats": True
|
||||||
|
},
|
||||||
|
}
|
||||||
|
# 超参由yaml传入
|
||||||
|
payload.update(request_func_input.hyper_parameters)
|
||||||
|
|
||||||
|
if request_func_input.ignore_eos:
|
||||||
|
payload["ignore_eos"] = request_func_input.ignore_eos
|
||||||
|
headers = {
|
||||||
|
"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"
|
||||||
|
}
|
||||||
|
|
||||||
|
output = RequestFuncOutput()
|
||||||
|
output.prompt_len = request_func_input.prompt_len
|
||||||
|
|
||||||
|
generated_text = ""
|
||||||
|
st = time.perf_counter()
|
||||||
|
most_recent_timestamp = st
|
||||||
|
try:
|
||||||
|
async with session.post(url=api_url, json=payload,
|
||||||
|
headers=headers) as response:
|
||||||
|
if response.status == 200:
|
||||||
|
first_chunk_received = False
|
||||||
|
async for chunk_bytes in response.content:
|
||||||
|
chunk_bytes = chunk_bytes.strip()
|
||||||
|
if not chunk_bytes:
|
||||||
|
continue
|
||||||
|
|
||||||
|
chunk = chunk_bytes.decode("utf-8").removeprefix(
|
||||||
|
"data: ")
|
||||||
|
if chunk != "[DONE]":
|
||||||
|
# print("####chunk:", chunk, chunk.usage)
|
||||||
|
data = json.loads(chunk)
|
||||||
|
|
||||||
|
# NOTE: Some completion API might have a last
|
||||||
|
# usage summary response without a token so we
|
||||||
|
# want to check a token was generated
|
||||||
|
if choices := data.get("choices"):
|
||||||
|
# Note that text could be empty here
|
||||||
|
# e.g. for special tokens
|
||||||
|
text = choices[0].get("text")
|
||||||
|
timestamp = time.perf_counter()
|
||||||
|
# First token
|
||||||
|
if not first_chunk_received:
|
||||||
|
first_chunk_received = True
|
||||||
|
ttft = time.perf_counter() - st
|
||||||
|
output.ttft = ttft
|
||||||
|
|
||||||
|
# Decoding phase
|
||||||
|
else:
|
||||||
|
output.itl.append(timestamp -
|
||||||
|
most_recent_timestamp)
|
||||||
|
|
||||||
|
most_recent_timestamp = timestamp
|
||||||
|
output.arrival_time.append(choices[0].get("arrival_time"))
|
||||||
|
generated_text += text or ""
|
||||||
|
elif usage := data.get("usage"):
|
||||||
|
output.prompt_tokens = usage.get(
|
||||||
|
"prompt_tokens")
|
||||||
|
output.output_tokens = usage.get(
|
||||||
|
"completion_tokens")
|
||||||
|
if first_chunk_received:
|
||||||
|
output.success = True
|
||||||
|
else:
|
||||||
|
output.success = False
|
||||||
|
output.error = (
|
||||||
|
"Never received a valid chunk to calculate TTFT."
|
||||||
|
"This response will be marked as failed!")
|
||||||
|
output.generated_text = generated_text
|
||||||
|
output.latency = most_recent_timestamp - st
|
||||||
|
else:
|
||||||
|
output.error = response.reason or ""
|
||||||
|
output.success = False
|
||||||
|
except Exception:
|
||||||
|
output.success = False
|
||||||
|
exc_info = sys.exc_info()
|
||||||
|
output.error = "".join(traceback.format_exception(*exc_info))
|
||||||
|
|
||||||
|
if pbar:
|
||||||
|
pbar.update(1)
|
||||||
|
return output
|
||||||
|
|
||||||
|
|
||||||
|
async def async_request_tgi(
|
||||||
|
request_func_input: RequestFuncInput,
|
||||||
|
pbar: Optional[tqdm] = None,
|
||||||
|
) -> RequestFuncOutput:
|
||||||
|
"""Request an LLM using the TGI API"""
|
||||||
|
api_url = request_func_input.api_url
|
||||||
|
assert api_url.endswith("generate_stream")
|
||||||
|
|
||||||
|
async with aiohttp.ClientSession(trust_env=True,
|
||||||
|
timeout=AIOHTTP_TIMEOUT) as session:
|
||||||
|
params = {
|
||||||
|
"max_new_tokens": request_func_input.output_len,
|
||||||
|
"do_sample": True,
|
||||||
|
"temperature": 0.01, # TGI does not accept 0.0 temperature.
|
||||||
|
"top_p": 0.99, # TGI does not accept 1.0 top_p.
|
||||||
|
"truncate": request_func_input.prompt_len,
|
||||||
|
"ignore_eos_token": request_func_input.ignore_eos,
|
||||||
|
}
|
||||||
|
payload = {
|
||||||
|
"inputs": request_func_input.prompt,
|
||||||
|
"parameters": params,
|
||||||
|
}
|
||||||
|
output = RequestFuncOutput()
|
||||||
|
output.prompt_len = request_func_input.prompt_len
|
||||||
|
if request_func_input.ignore_eos:
|
||||||
|
output.output_tokens = request_func_input.output_len
|
||||||
|
else:
|
||||||
|
output.output_tokens = None
|
||||||
|
|
||||||
|
ttft = 0.0
|
||||||
|
st = time.perf_counter()
|
||||||
|
most_recent_timestamp = st
|
||||||
|
try:
|
||||||
|
async with session.post(url=api_url, json=payload) as response:
|
||||||
|
if response.status == 200:
|
||||||
|
async for chunk_bytes in response.content:
|
||||||
|
chunk_bytes = chunk_bytes.strip()
|
||||||
|
if not chunk_bytes:
|
||||||
|
continue
|
||||||
|
chunk_bytes = chunk_bytes.decode("utf-8")
|
||||||
|
|
||||||
|
# NOTE: Sometimes TGI returns a ping response without
|
||||||
|
# any data, we should skip it.
|
||||||
|
if chunk_bytes.startswith(":"):
|
||||||
|
continue
|
||||||
|
chunk = chunk_bytes.removeprefix("data:")
|
||||||
|
|
||||||
|
data = json.loads(chunk)
|
||||||
|
timestamp = time.perf_counter()
|
||||||
|
# First token
|
||||||
|
if ttft == 0.0:
|
||||||
|
ttft = time.perf_counter() - st
|
||||||
|
output.ttft = ttft
|
||||||
|
|
||||||
|
# Decoding phase
|
||||||
|
else:
|
||||||
|
output.itl.append(timestamp -
|
||||||
|
most_recent_timestamp)
|
||||||
|
|
||||||
|
most_recent_timestamp = timestamp
|
||||||
|
output.arrival_time.append(data["arrival_time"])
|
||||||
|
|
||||||
|
output.latency = most_recent_timestamp - st
|
||||||
|
output.success = True
|
||||||
|
output.generated_text = data["generated_text"]
|
||||||
|
else:
|
||||||
|
output.error = response.reason or ""
|
||||||
|
output.success = False
|
||||||
|
except Exception:
|
||||||
|
output.success = False
|
||||||
|
exc_info = sys.exc_info()
|
||||||
|
output.error = "".join(traceback.format_exception(*exc_info))
|
||||||
|
|
||||||
|
if pbar:
|
||||||
|
pbar.update(1)
|
||||||
|
return output
|
||||||
|
|
||||||
|
|
||||||
|
async def async_request_trt_llm(
|
||||||
|
request_func_input: RequestFuncInput,
|
||||||
|
pbar: Optional[tqdm] = None,
|
||||||
|
) -> RequestFuncOutput:
|
||||||
|
"""Request an LLM using TRT's llm_server"""
|
||||||
|
api_url = request_func_input.api_url
|
||||||
|
assert api_url.endswith("generate_stream")
|
||||||
|
|
||||||
|
async with aiohttp.ClientSession(trust_env=True,
|
||||||
|
timeout=AIOHTTP_TIMEOUT) as session:
|
||||||
|
payload = {
|
||||||
|
"accumulate_tokens": True,
|
||||||
|
"text_input": request_func_input.prompt,
|
||||||
|
"temperature": 0.0,
|
||||||
|
"top_p": 1.0,
|
||||||
|
"max_tokens": request_func_input.output_len,
|
||||||
|
"stream": True,
|
||||||
|
}
|
||||||
|
if request_func_input.ignore_eos:
|
||||||
|
payload["min_length"] = request_func_input.output_len
|
||||||
|
output = RequestFuncOutput()
|
||||||
|
output.prompt_len = request_func_input.prompt_len
|
||||||
|
|
||||||
|
ttft = 0.0
|
||||||
|
st = time.perf_counter()
|
||||||
|
most_recent_timestamp = st
|
||||||
|
try:
|
||||||
|
async with session.post(url=api_url, json=payload) as response:
|
||||||
|
if response.status == 200:
|
||||||
|
async for chunk_bytes in response.content:
|
||||||
|
chunk_bytes = chunk_bytes.strip()
|
||||||
|
if not chunk_bytes:
|
||||||
|
continue
|
||||||
|
|
||||||
|
chunk = chunk_bytes.decode("utf-8").removeprefix(
|
||||||
|
"data:")
|
||||||
|
|
||||||
|
data = json.loads(chunk)
|
||||||
|
output.generated_text += data["text_output"]
|
||||||
|
timestamp = time.perf_counter()
|
||||||
|
# First token
|
||||||
|
if ttft == 0.0:
|
||||||
|
ttft = timestamp - st
|
||||||
|
output.ttft = ttft
|
||||||
|
|
||||||
|
# Decoding phase
|
||||||
|
else:
|
||||||
|
output.itl.append(timestamp -
|
||||||
|
most_recent_timestamp)
|
||||||
|
|
||||||
|
most_recent_timestamp = timestamp
|
||||||
|
|
||||||
|
output.latency = most_recent_timestamp - st
|
||||||
|
output.success = True
|
||||||
|
|
||||||
|
else:
|
||||||
|
output.error = response.reason or ""
|
||||||
|
output.success = False
|
||||||
|
except Exception:
|
||||||
|
output.success = False
|
||||||
|
exc_info = sys.exc_info()
|
||||||
|
output.error = "".join(traceback.format_exception(*exc_info))
|
||||||
|
|
||||||
|
if pbar:
|
||||||
|
pbar.update(1)
|
||||||
|
return output
|
||||||
|
|
||||||
|
|
||||||
|
async def async_request_deepspeed_mii(
|
||||||
|
request_func_input: RequestFuncInput,
|
||||||
|
pbar: Optional[tqdm] = None,
|
||||||
|
) -> RequestFuncOutput:
|
||||||
|
"""Request an LLM using Deepspeed MII"""
|
||||||
|
async with aiohttp.ClientSession(trust_env=True,
|
||||||
|
timeout=AIOHTTP_TIMEOUT) as session:
|
||||||
|
|
||||||
|
payload = {
|
||||||
|
"prompt": request_func_input.prompt,
|
||||||
|
"max_tokens": request_func_input.output_len,
|
||||||
|
"temperature": 0.01, # deepspeed-mii does not accept 0.0 temp.
|
||||||
|
"top_p": 1.0,
|
||||||
|
}
|
||||||
|
output = RequestFuncOutput()
|
||||||
|
output.prompt_len = request_func_input.prompt_len
|
||||||
|
|
||||||
|
# NOTE: DeepSpeed-MII doesn't support streaming as of Jan 28 2024,
|
||||||
|
# will use 0 as placeholder.
|
||||||
|
# See https://github.com/microsoft/DeepSpeed-MII/pull/311
|
||||||
|
output.ttft = 0
|
||||||
|
|
||||||
|
st = time.perf_counter()
|
||||||
|
try:
|
||||||
|
async with session.post(url=request_func_input.api_url,
|
||||||
|
json=payload) as response:
|
||||||
|
if response.status == 200:
|
||||||
|
parsed_resp = await response.json()
|
||||||
|
output.latency = time.perf_counter() - st
|
||||||
|
if "choices" in parsed_resp:
|
||||||
|
output.generated_text = parsed_resp["choices"][0][
|
||||||
|
"text"]
|
||||||
|
elif "text" in parsed_resp:
|
||||||
|
output.generated_text = parsed_resp["text"][0]
|
||||||
|
else:
|
||||||
|
output.error = ("Unexpected response format: "
|
||||||
|
"neither 'choices' nor 'text' found")
|
||||||
|
output.success = False
|
||||||
|
output.success = True
|
||||||
|
else:
|
||||||
|
output.error = response.reason or ""
|
||||||
|
output.success = False
|
||||||
|
except Exception:
|
||||||
|
output.success = False
|
||||||
|
exc_info = sys.exc_info()
|
||||||
|
output.error = "".join(traceback.format_exception(*exc_info))
|
||||||
|
|
||||||
|
if pbar:
|
||||||
|
pbar.update(1)
|
||||||
|
return output
|
||||||
|
|
||||||
|
|
||||||
|
async def async_request_openai_completions(
|
||||||
|
request_func_input: RequestFuncInput,
|
||||||
|
pbar: Optional[tqdm] = None,
|
||||||
|
) -> RequestFuncOutput:
|
||||||
|
"""Request an LLM using OpenAI"""
|
||||||
|
api_url = request_func_input.api_url
|
||||||
|
assert api_url.endswith(
|
||||||
|
("completions", "profile")
|
||||||
|
), "OpenAI Completions API URL must end with 'completions' or 'profile'."
|
||||||
|
|
||||||
|
async with aiohttp.ClientSession(trust_env=True,
|
||||||
|
timeout=AIOHTTP_TIMEOUT) as session:
|
||||||
|
payload = {
|
||||||
|
"model": request_func_input.model_name \
|
||||||
|
if request_func_input.model_name else request_func_input.model,
|
||||||
|
"prompt": request_func_input.prompt,
|
||||||
|
# "temperature": 0.0,
|
||||||
|
"max_tokens": request_func_input.output_len,
|
||||||
|
"logprobs": request_func_input.logprobs,
|
||||||
|
"stream": True,
|
||||||
|
#"stream_options": {
|
||||||
|
# "include_usage": True,
|
||||||
|
#},
|
||||||
|
}
|
||||||
|
if request_func_input.ignore_eos:
|
||||||
|
payload["ignore_eos"] = request_func_input.ignore_eos
|
||||||
|
|
||||||
|
headers = {
|
||||||
|
"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"
|
||||||
|
}
|
||||||
|
|
||||||
|
output = RequestFuncOutput()
|
||||||
|
output.prompt_len = request_func_input.prompt_len
|
||||||
|
|
||||||
|
generated_text = ""
|
||||||
|
st = time.perf_counter()
|
||||||
|
most_recent_timestamp = st
|
||||||
|
try:
|
||||||
|
async with session.post(url=api_url, json=payload,
|
||||||
|
headers=headers) as response:
|
||||||
|
if response.status == 200:
|
||||||
|
first_chunk_received = False
|
||||||
|
async for chunk_bytes in response.content:
|
||||||
|
chunk_bytes = chunk_bytes.strip()
|
||||||
|
if not chunk_bytes:
|
||||||
|
continue
|
||||||
|
|
||||||
|
chunk = chunk_bytes.decode("utf-8").removeprefix(
|
||||||
|
"data: ")
|
||||||
|
if chunk != "[DONE]":
|
||||||
|
# print("####chunk:", chunk, type(chunk))
|
||||||
|
data = json.loads(chunk)
|
||||||
|
|
||||||
|
# NOTE: Some completion API might have a last
|
||||||
|
# usage summary response without a token so we
|
||||||
|
# want to check a token was generated
|
||||||
|
if choices := data.get("choices"):
|
||||||
|
# Note that text could be empty here
|
||||||
|
# e.g. for special tokens
|
||||||
|
text = choices[0].get("text")
|
||||||
|
timestamp = time.perf_counter()
|
||||||
|
# First token
|
||||||
|
if not first_chunk_received:
|
||||||
|
first_chunk_received = True
|
||||||
|
ttft = time.perf_counter() - st
|
||||||
|
output.ttft = ttft
|
||||||
|
|
||||||
|
# Decoding phase
|
||||||
|
else:
|
||||||
|
output.itl.append(timestamp -
|
||||||
|
most_recent_timestamp)
|
||||||
|
|
||||||
|
most_recent_timestamp = timestamp
|
||||||
|
generated_text += text or ""
|
||||||
|
elif usage := data.get("usage"):
|
||||||
|
output.output_tokens = usage.get(
|
||||||
|
"completion_tokens")
|
||||||
|
if first_chunk_received:
|
||||||
|
output.success = True
|
||||||
|
else:
|
||||||
|
output.success = False
|
||||||
|
output.error = (
|
||||||
|
"Never received a valid chunk to calculate TTFT."
|
||||||
|
"This response will be marked as failed!")
|
||||||
|
output.generated_text = generated_text
|
||||||
|
output.latency = most_recent_timestamp - st
|
||||||
|
else:
|
||||||
|
output.error = response.reason or ""
|
||||||
|
output.success = False
|
||||||
|
except Exception:
|
||||||
|
output.success = False
|
||||||
|
exc_info = sys.exc_info()
|
||||||
|
output.error = "".join(traceback.format_exception(*exc_info))
|
||||||
|
|
||||||
|
if pbar:
|
||||||
|
pbar.update(1)
|
||||||
|
return output
|
||||||
|
|
||||||
|
|
||||||
|
async def async_request_openai_audio(
|
||||||
|
request_func_input: RequestFuncInput,
|
||||||
|
pbar: Optional[tqdm] = None,
|
||||||
|
) -> RequestFuncOutput:
|
||||||
|
"""Request an LLM using OpenAI"""
|
||||||
|
# Lazy import without PlaceholderModule to avoid vllm dep.
|
||||||
|
import soundfile
|
||||||
|
api_url = request_func_input.api_url
|
||||||
|
assert api_url.endswith(
|
||||||
|
("transcriptions", "translations"
|
||||||
|
)), "OpenAI Chat Completions API URL must end with 'transcriptions' "
|
||||||
|
"or `translations`."
|
||||||
|
|
||||||
|
async with aiohttp.ClientSession(trust_env=True,
|
||||||
|
timeout=AIOHTTP_TIMEOUT) as session:
|
||||||
|
content = [{"type": "text", "text": request_func_input.prompt}]
|
||||||
|
payload = {
|
||||||
|
"model": request_func_input.model_name \
|
||||||
|
if request_func_input.model_name else request_func_input.model,
|
||||||
|
"temperature": 0.0,
|
||||||
|
"max_completion_tokens": request_func_input.output_len,
|
||||||
|
"stream": True,
|
||||||
|
"language": "en",
|
||||||
|
# Flattened due to multipart/form-data
|
||||||
|
"stream_include_usage": True,
|
||||||
|
"stream_continuous_usage_stats": True
|
||||||
|
}
|
||||||
|
if request_func_input.extra_body:
|
||||||
|
payload.update(request_func_input.extra_body)
|
||||||
|
headers = {
|
||||||
|
"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}",
|
||||||
|
}
|
||||||
|
|
||||||
|
# Send audio file
|
||||||
|
def to_bytes(y, sr):
|
||||||
|
buffer = io.BytesIO()
|
||||||
|
soundfile.write(buffer, y, sr, format="WAV")
|
||||||
|
buffer.seek(0)
|
||||||
|
return buffer
|
||||||
|
|
||||||
|
with to_bytes(*request_func_input.multi_modal_content['audio']) as f:
|
||||||
|
form = aiohttp.FormData()
|
||||||
|
form.add_field('file', f, content_type='audio/wav')
|
||||||
|
for key, value in payload.items():
|
||||||
|
form.add_field(key, str(value))
|
||||||
|
|
||||||
|
output = RequestFuncOutput()
|
||||||
|
output.prompt_len = request_func_input.prompt_len
|
||||||
|
|
||||||
|
generated_text = ""
|
||||||
|
ttft = 0.0
|
||||||
|
st = time.perf_counter()
|
||||||
|
most_recent_timestamp = st
|
||||||
|
try:
|
||||||
|
async with session.post(url=api_url,
|
||||||
|
data=form,
|
||||||
|
headers=headers) as response:
|
||||||
|
if response.status == 200:
|
||||||
|
async for chunk_bytes in response.content:
|
||||||
|
chunk_bytes = chunk_bytes.strip()
|
||||||
|
if not chunk_bytes:
|
||||||
|
continue
|
||||||
|
|
||||||
|
chunk = chunk_bytes.decode("utf-8").removeprefix(
|
||||||
|
"data: ")
|
||||||
|
if chunk != "[DONE]":
|
||||||
|
timestamp = time.perf_counter()
|
||||||
|
data = json.loads(chunk)
|
||||||
|
|
||||||
|
if choices := data.get("choices"):
|
||||||
|
content = choices[0]["delta"].get(
|
||||||
|
"content")
|
||||||
|
# First token
|
||||||
|
if ttft == 0.0:
|
||||||
|
ttft = timestamp - st
|
||||||
|
output.ttft = ttft
|
||||||
|
|
||||||
|
# Decoding phase
|
||||||
|
else:
|
||||||
|
output.itl.append(
|
||||||
|
timestamp - most_recent_timestamp)
|
||||||
|
|
||||||
|
generated_text += content or ""
|
||||||
|
elif usage := data.get("usage"):
|
||||||
|
output.output_tokens = usage.get(
|
||||||
|
"completion_tokens")
|
||||||
|
|
||||||
|
most_recent_timestamp = timestamp
|
||||||
|
|
||||||
|
output.generated_text = generated_text
|
||||||
|
output.success = True
|
||||||
|
output.latency = most_recent_timestamp - st
|
||||||
|
else:
|
||||||
|
output.error = response.reason or ""
|
||||||
|
output.success = False
|
||||||
|
except Exception:
|
||||||
|
output.success = False
|
||||||
|
exc_info = sys.exc_info()
|
||||||
|
output.error = "".join(traceback.format_exception(*exc_info))
|
||||||
|
|
||||||
|
if pbar:
|
||||||
|
pbar.update(1)
|
||||||
|
return output

ASYNC_REQUEST_FUNCS = {
    "tgi": async_request_tgi,
    "vllm": async_request_openai_completions,
    "lmdeploy": async_request_openai_completions,
    "deepspeed-mii": async_request_deepspeed_mii,
    "openai": async_request_eb_openai_completions,
    "openai-chat": async_request_eb_openai_chat_completions,
    "openai-audio": async_request_openai_audio,
    "tensorrt-llm": async_request_trt_llm,
    "scalellm": async_request_openai_completions,
    "sglang": async_request_openai_completions,
}

OPENAI_COMPATIBLE_BACKENDS = [
    k for k, v in ASYNC_REQUEST_FUNCS.items()
    if v in (async_request_openai_completions,
             async_request_eb_openai_chat_completions)
]
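A minimal usage sketch (not part of the file): dispatching one streaming request through the backend table above with asyncio. The URL, port, and prompt are placeholders.

```python
import asyncio

async def main():
    # Look up the request coroutine for the "openai-chat" backend and await it.
    request_func = ASYNC_REQUEST_FUNCS["openai-chat"]
    result = await request_func(
        RequestFuncInput(
            prompt="hello",
            history_QA=[{"role": "user", "content": "hello"}],
            hyper_parameters={},
            api_url="http://0.0.0.0:9812/v1/chat/completions",
            prompt_len=0,
            output_len=128,
            model="default",
        )
    )
    print(result.success, result.ttft, result.generated_text[:80])

asyncio.run(main())
```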
benchmarks/benchmark_dataset.py (new file, 309 lines)
@@ -0,0 +1,309 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""

# This file is modified from https://github.com/vllm-project/vllm/blob/main/benchmarks/benchmark_dataset.py


import base64
import io
import json
import logging
import random
from abc import ABC, abstractmethod
from collections.abc import Mapping
from dataclasses import dataclass
from io import BytesIO
from typing import Any, Callable, Optional, Union

from PIL import Image

logger = logging.getLogger(__name__)


@dataclass
class SampleRequest:
    """
    Represents a single inference request for benchmarking.
    """

    prompt: Union[str, Any]
    history_QA: Union[str, Any]
    json_data: Optional[dict]
    prompt_len: int
    expected_output_len: int


class BenchmarkDataset(ABC):
|
||||||
|
"""BenchmarkDataset"""
|
||||||
|
DEFAULT_SEED = 0
|
||||||
|
IS_MULTIMODAL = False
|
||||||
|
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
dataset_path: Optional[str] = None,
|
||||||
|
random_seed: int = DEFAULT_SEED,
|
||||||
|
hyperparameter_path: Optional[str] = None,
|
||||||
|
) -> None:
|
||||||
|
"""
|
||||||
|
Initialize the BenchmarkDataset with an optional dataset path and random
|
||||||
|
seed. Args:
|
||||||
|
dataset_path (Optional[str]): Path to the dataset. If None, it
|
||||||
|
indicates that a default or random dataset might be used.
|
||||||
|
random_seed (int): Seed value for reproducible shuffling or
|
||||||
|
sampling. Defaults to DEFAULT_SEED.
|
||||||
|
"""
|
||||||
|
self.dataset_path = dataset_path
|
||||||
|
# Set the random seed, ensuring that a None value is replaced with the
|
||||||
|
# default seed.
|
||||||
|
self.random_seed = (random_seed
|
||||||
|
if random_seed is not None else self.DEFAULT_SEED)
|
||||||
|
self.data = None
|
||||||
|
self.hyperparameter_path = hyperparameter_path
|
||||||
|
self.hyperparameters = {}
|
||||||
|
|
||||||
|
def load_data(self) -> None:
|
||||||
|
"""
|
||||||
|
Load data from the dataset path into self.data.
|
||||||
|
|
||||||
|
This method must be overridden by subclasses since the method to load
|
||||||
|
data will vary depending on the dataset format and source.
|
||||||
|
|
||||||
|
Raises:
|
||||||
|
NotImplementedError: If a subclass does not implement this method.
|
||||||
|
"""
|
||||||
|
# TODO (jenniferzhao): add support for downloading data
|
||||||
|
raise NotImplementedError(
|
||||||
|
"load_data must be implemented in subclasses.")
|
||||||
|
|
||||||
|
@abstractmethod
|
||||||
|
def sample(self, num_requests: int) -> list[SampleRequest]:
|
||||||
|
"""
|
||||||
|
Abstract method to generate sample requests from the dataset.
|
||||||
|
|
||||||
|
Subclasses must override this method to implement dataset-specific logic
|
||||||
|
for generating a list of SampleRequest objects.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
num_requests (int): The number of sample requests to generate.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
list[SampleRequest]: A list of sample requests generated from the
|
||||||
|
dataset.
|
||||||
|
"""
|
||||||
|
raise NotImplementedError("sample must be implemented in subclasses.")
|
||||||
|
|
||||||
|
def maybe_oversample_requests(self, requests: list[SampleRequest],
|
||||||
|
num_requests: int) -> None:
|
||||||
|
"""
|
||||||
|
Oversamples the list of requests if its size is less than the desired
|
||||||
|
number.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
requests (List[SampleRequest]): The current list of sampled
|
||||||
|
requests. num_requests (int): The target number of requests.
|
||||||
|
"""
|
||||||
|
if len(requests) < num_requests:
|
||||||
|
random.seed(self.random_seed)
|
||||||
|
additional = random.choices(requests,
|
||||||
|
k=num_requests - len(requests))
|
||||||
|
requests.extend(additional)
|
||||||
|
logger.info("Oversampled requests to reach %d total samples.",
|
||||||
|
num_requests)
|
||||||
|
|
||||||
|
|
||||||
|
def is_valid_sequence(
|
||||||
|
prompt_len: int,
|
||||||
|
output_len: int,
|
||||||
|
min_len: int = 4,
|
||||||
|
max_prompt_len: int = 1024,
|
||||||
|
max_total_len: int = 2048,
|
||||||
|
skip_min_output_len_check: bool = False,
|
||||||
|
) -> bool:
|
||||||
|
"""
|
||||||
|
Validate a sequence based on prompt and output lengths.
|
||||||
|
|
||||||
|
Default pruning criteria are copied from the original `sample_hf_requests`
|
||||||
|
and `sample_sharegpt_requests` functions in benchmark_serving.py, as well as
|
||||||
|
from `sample_requests` in benchmark_throughput.py.
|
||||||
|
"""
|
||||||
|
# Check for invalid conditions
|
||||||
|
prompt_too_short = prompt_len < min_len
|
||||||
|
output_too_short = (not skip_min_output_len_check) and (output_len
|
||||||
|
< min_len)
|
||||||
|
prompt_too_long = prompt_len > max_prompt_len
|
||||||
|
combined_too_long = (prompt_len + output_len) > max_total_len
|
||||||
|
|
||||||
|
# Return True if none of the invalid conditions are met
|
||||||
|
return not (prompt_too_short or output_too_short or prompt_too_long
|
||||||
|
or combined_too_long)
|
||||||
|
|
||||||
|
|
||||||
|
def process_image(image: Any) -> Mapping[str, Any]:
|
||||||
|
"""
|
||||||
|
Process a single image input and return a multimedia content dictionary.
|
||||||
|
|
||||||
|
Supports three input types:
|
||||||
|
|
||||||
|
1. Dictionary with raw image bytes: - Expects a dict with a 'bytes' key
|
||||||
|
containing raw image data. - Loads the bytes as a PIL.Image.Image.
|
||||||
|
|
||||||
|
2. PIL.Image.Image input: - Converts the image to RGB. - Saves the image as
|
||||||
|
a JPEG in memory. - Encodes the JPEG data as a base64 string. - Returns
|
||||||
|
a dictionary with the image as a base64 data URL.
|
||||||
|
|
||||||
|
3. String input: - Treats the string as a URL or local file path. -
|
||||||
|
Prepends "file://" if the string doesn't start with "http://" or
|
||||||
|
"file://". - Returns a dictionary with the image URL.
|
||||||
|
|
||||||
|
Raises:
|
||||||
|
ValueError: If the input is not a supported type.
|
||||||
|
"""
|
||||||
|
if isinstance(image, dict) and 'bytes' in image:
|
||||||
|
image = Image.open(BytesIO(image['bytes']))
|
||||||
|
if isinstance(image, Image.Image):
|
||||||
|
image = image.convert("RGB")
|
||||||
|
with io.BytesIO() as image_data:
|
||||||
|
image.save(image_data, format="JPEG")
|
||||||
|
image_base64 = base64.b64encode(
|
||||||
|
image_data.getvalue()).decode("utf-8")
|
||||||
|
return {
|
||||||
|
"type": "image_url",
|
||||||
|
"image_url": {
|
||||||
|
"url": f"data:image/jpeg;base64,{image_base64}"
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
if isinstance(image, str):
|
||||||
|
image_url = (image if image.startswith(
|
||||||
|
("http://", "file://")) else f"file://{image}")
|
||||||
|
return {"type": "image_url", "image_url": {"url": image_url}}
|
||||||
|
|
||||||
|
raise ValueError(f"Invalid image input {image}. Must be a PIL.Image.Image"
|
||||||
|
" or str or dictionary with raw image bytes.")
|
||||||
|
|
||||||
|
|
||||||
|
class EBDataset(BenchmarkDataset):
|
||||||
|
"""
|
||||||
|
Implements the ShareGPT dataset. Loads data from a JSON file and generates
|
||||||
|
sample requests based on conversation turns.
|
||||||
|
"""
|
||||||
|
|
||||||
|
temperature: float
|
||||||
|
repetition_penalty: float
|
||||||
|
frequency_penalty: float
|
||||||
|
presence_penalty: float
|
||||||
|
top_p: float
|
||||||
|
prompt_len: int
|
||||||
|
|
||||||
|
def __init__(self, **kwargs) -> None:
|
||||||
|
super().__init__(**kwargs)
|
||||||
|
self.load_data()
|
||||||
|
|
||||||
|
def load_data(self) -> None:
|
||||||
|
if self.dataset_path is None:
|
||||||
|
raise ValueError("dataset_path must be provided for loading data.")
|
||||||
|
|
||||||
|
with open(self.dataset_path, encoding="utf-8") as f:
|
||||||
|
self.data = [json.loads(i.strip()) for i in f.readlines()]
|
||||||
|
|
||||||
|
def sample(
|
||||||
|
self,
|
||||||
|
num_requests: int,
|
||||||
|
lora_path: Optional[str] = None,
|
||||||
|
max_loras: Optional[int] = None,
|
||||||
|
output_len: Optional[int] = None,
|
||||||
|
enable_multimodal_chat: bool = False,
|
||||||
|
**kwargs,
|
||||||
|
) -> list:
|
||||||
|
samples: list = []
|
||||||
|
for entry in self.data:
|
||||||
|
if len(samples) >= num_requests:
|
||||||
|
break
|
||||||
|
prompt = entry["text"]
|
||||||
|
self.temperature = float(entry["temperature"])
|
||||||
|
self.repetition_penalty = float(entry["penalty_score"])
|
||||||
|
self.frequency_penalty = float(entry["frequency_score"])
|
||||||
|
self.presence_penalty = float(entry["presence_score"])
|
||||||
|
self.top_p = float(entry["topp"])
|
||||||
|
self.prompt_len = int(entry["input_token_num"])
|
||||||
|
new_output_len = int(entry["max_dec_len"])
|
||||||
|
|
||||||
|
if enable_multimodal_chat:
|
||||||
|
prompt = self.apply_multimodal_chat_transformation(
|
||||||
|
prompt, None)
|
||||||
|
samples.append(
|
||||||
|
SampleRequest(
|
||||||
|
prompt=prompt,
|
||||||
|
prompt_len=self.prompt_len,
|
||||||
|
history_QA=[],
|
||||||
|
expected_output_len=new_output_len,
|
||||||
|
))
|
||||||
|
|
||||||
|
self.maybe_oversample_requests(samples, num_requests)
|
||||||
|
return samples
|
||||||
|
|
||||||
|
|
||||||
|
class EBChatDataset(BenchmarkDataset):
|
||||||
|
"""
|
||||||
|
Implements the ShareGPT dataset. Loads data from a JSON file and generates
|
||||||
|
sample requests based on conversation turns.
|
||||||
|
"""
|
||||||
|
prompt_len: int
|
||||||
|
|
||||||
|
def __init__(self, **kwargs) -> None:
|
||||||
|
super().__init__(**kwargs)
|
||||||
|
self.load_data()
|
||||||
|
|
||||||
|
def load_data(self) -> None:
|
||||||
|
if self.dataset_path is None:
|
||||||
|
raise ValueError("dataset_path must be provided for loading data.")
|
||||||
|
|
||||||
|
with open(self.dataset_path, encoding="utf-8") as f:
|
||||||
|
self.data = [json.loads(i.strip()) for i in f.readlines()]
|
||||||
|
|
||||||
|
def sample(
|
||||||
|
self,
|
||||||
|
num_requests: int,
|
||||||
|
lora_path: Optional[str] = None,
|
||||||
|
max_loras: Optional[int] = None,
|
||||||
|
output_len: Optional[int] = None,
|
||||||
|
enable_multimodal_chat: bool = False,
|
||||||
|
**kwargs,
|
||||||
|
) -> list:
|
||||||
|
samples: list = []
|
||||||
|
for entry in self.data:
|
||||||
|
if len(samples) >= num_requests:
|
||||||
|
break
|
||||||
|
json_data = entry
|
||||||
|
prompt = entry["messages"][-1].get("content", "")
|
||||||
|
history_QA = entry.get("messages", [])
|
||||||
|
new_output_len = int(entry.get("max_tokens", 12288))
|
||||||
|
|
||||||
|
if enable_multimodal_chat:
|
||||||
|
prompt = self.apply_multimodal_chat_transformation(
|
||||||
|
prompt, None)
|
||||||
|
samples.append(
|
||||||
|
SampleRequest(
|
||||||
|
json_data=json_data,
|
||||||
|
prompt=prompt,
|
||||||
|
prompt_len=0,
|
||||||
|
history_QA=history_QA,
|
||||||
|
expected_output_len=new_output_len,
|
||||||
|
))
|
||||||
|
|
||||||
|
self.maybe_oversample_requests(samples, num_requests)
|
||||||
|
return samples
|
||||||
|
|
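A brief usage sketch (not part of the file) for the EBChatDataset class defined above, assuming an FD-format JSONL file in which each line carries a "messages" list; the file name matches the dataset from the benchmarks README.

```python
dataset = EBChatDataset(
    dataset_path="./filtered_sharedgpt_2000_input_1136_output_200_fd.json"
)
requests = dataset.sample(num_requests=4)
for req in requests:
    print(req.expected_output_len, str(req.prompt)[:60])
```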
benchmarks/benchmark_serving.py (new file, 1141 lines): file diff suppressed because it is too large.
benchmarks/benchmark_utils.py (new file, 90 lines)
@@ -0,0 +1,90 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""

# This file is modified from https://github.com/vllm-project/vllm/blob/main/benchmarks/benchmark_utils.py


import argparse
import json
import math
import os
from typing import Any


def convert_to_pytorch_benchmark_format(args: argparse.Namespace,
                                        metrics: dict[str, list],
                                        extra_info: dict[str, Any]) -> list:
    """
    Save the benchmark results in the format used by the PyTorch OSS benchmark,
    with one metric per record.
    https://github.com/pytorch/pytorch/wiki/How-to-integrate-with-PyTorch-OSS-benchmark-database
    """
    records = []
    if not os.environ.get("SAVE_TO_PYTORCH_BENCHMARK_FORMAT", False):
        return records

    for name, benchmark_values in metrics.items():
        record = {
            "benchmark": {
                "name": "vLLM benchmark",
                "extra_info": {
                    "args": vars(args),
                },
            },
            "model": {
                "name": args.model,
            },
            "metric": {
                "name": name,
                "benchmark_values": benchmark_values,
                "extra_info": extra_info,
            },
        }

        tp = record["benchmark"]["extra_info"]["args"].get(
            "tensor_parallel_size")
        # Save tensor_parallel_size parameter if it's part of the metadata
        if not tp and "tensor_parallel_size" in extra_info:
            record["benchmark"]["extra_info"]["args"][
                "tensor_parallel_size"] = extra_info["tensor_parallel_size"]

        records.append(record)

    return records


class InfEncoder(json.JSONEncoder):
    """JSON encoder that replaces infinite floats with the string "inf"."""

    def clear_inf(self, o: Any):
        """Recursively replace infinite floats in dicts and lists."""
        if isinstance(o, dict):
            return {k: self.clear_inf(v) for k, v in o.items()}
        elif isinstance(o, list):
            return [self.clear_inf(v) for v in o]
        elif isinstance(o, float) and math.isinf(o):
            return "inf"
        return o

    def iterencode(self, o: Any, *args, **kwargs) -> Any:
        """Encode after sanitizing infinite values."""
        return super().iterencode(self.clear_inf(o), *args, **kwargs)


def write_to_json(filename: str, records: list) -> None:
    """Write benchmark records to a JSON file using InfEncoder."""
    with open(filename, "w") as f:
        json.dump(records, f, cls=InfEncoder)
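A short usage sketch of the helpers above. The argparse attributes and metric values are made up for illustration, and the import assumes the script runs from the benchmarks/ directory; records are only produced when SAVE_TO_PYTORCH_BENCHMARK_FORMAT is set.

import argparse
import os

from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json

os.environ["SAVE_TO_PYTORCH_BENCHMARK_FORMAT"] = "1"

args = argparse.Namespace(model="demo-model", tensor_parallel_size=None)
metrics = {"ttft_ms": [12.5, float("inf")]}   # the inf value is serialized as "inf"
extra_info = {"tensor_parallel_size": 8}      # backfills the missing tp argument

records = convert_to_pytorch_benchmark_format(args, metrics, extra_info)
write_to_json("pytorch_benchmark_records.json", records)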
benchmarks/requirements.txt (new file, 5 lines)
aiohttp
tqdm
numpy
Pillow
pyyaml
benchmarks/yaml/eb45-128k-wint4-a800-tp8.yaml (new file, 8 lines)
enable_chunked_prefill: True
max_model_len: 131072
max_num_seqs: 16
kv_cache_ratio: 0.75
tensor_parallel_size: 8
max_num_batched_tokens: 4096
max_num_partial_prefills: 3
max_long_partial_prefills: 3

benchmarks/yaml/eb45-128k-wint4-p800-tp8.yaml (new file, 5 lines)
max_model_len: 131072
max_num_seqs: 40
gpu_memory_utilization: 0.9
tensor_parallel_size: 8
quantization: wint4

benchmarks/yaml/eb45-128k-wint8-a800-tp8.yaml (new file, 8 lines)
enable_chunked_prefill: True
max_model_len: 131072
max_num_seqs: 16
kv_cache_ratio: 0.75
tensor_parallel_size: 8
max_num_batched_tokens: 4096
max_num_partial_prefills: 3
max_long_partial_prefills: 3

benchmarks/yaml/eb45-21B-vl-128k-wint4-h800-tp1.yaml (new file, 10 lines)
enable_mm: True
max_model_len: 32768
max_num_seqs: 128
gpu_memory_utilization: 0.9
kv_cache_ratio: 0.71
tensor_parallel_size: 1
enable_chunked_prefill: True
max_num_batched_tokens: 384
quantization: wint4
reasoning_parser: ernie-45-vl

benchmarks/yaml/eb45-21b-a3b-32k-bf16.yaml (new file, 5 lines)
max_model_len: 32768
max_num_seqs: 128
kv_cache_ratio: 0.75
tensor_parallel_size: 1
max_num_batched_tokens: 32768

benchmarks/yaml/eb45-21b-a3b-32k-wint4-a10.yaml (new file, 5 lines)
max_model_len: 32768
max_num_seqs: 32
kv_cache_ratio: 0.5
tensor_parallel_size: 1
quantization: wint4

benchmarks/yaml/eb45-21b-a3b-32k-wint4.yaml (new file, 6 lines)
max_model_len: 32768
max_num_seqs: 128
kv_cache_ratio: 0.75
tensor_parallel_size: 1
max_num_batched_tokens: 32768
quantization: wint4

benchmarks/yaml/eb45-21b-a3b-32k-wint8.yaml (new file, 6 lines)
max_model_len: 32768
max_num_seqs: 128
kv_cache_ratio: 0.75
tensor_parallel_size: 1
max_num_batched_tokens: 32768
quantization: wint8

benchmarks/yaml/eb45-32k-bf16-a30-tp1.yaml (new file, 5 lines)
max_model_len: 32768
max_num_seqs: 128
kv_cache_ratio: 0.75
tensor_parallel_size: 1
max_num_batched_tokens: 32768

benchmarks/yaml/eb45-32k-blockwise-fp8-h800-tp8.yaml (new file, 12 lines)
max_model_len: 32768
max_num_seqs: 256
tensor_parallel_size: 8
quantization: block_wise_fp8
gpu_memory_utilization: 0.9
kv_cache_ratio: 0.8
enable_chunked_prefill: True
max_num_batched_tokens: 1024
max_num_partial_prefills: 3
max_long_partial_prefills: 3
enable_prefix_caching: True
swap_space: 200

benchmarks/yaml/eb45-32k-tensorwise-fp8-h800-tp8.yaml (new file, 11 lines)
max_model_len: 32768
max_num_seqs: 256
tensor_parallel_size: 8
gpu_memory_utilization: 0.9
kv_cache_ratio: 0.8
enable_chunked_prefill: True
max_num_batched_tokens: 1024
max_num_partial_prefills: 3
max_long_partial_prefills: 3
enable_prefix_caching: True
swap_space: 200

benchmarks/yaml/eb45-32k-w4a8c8-a800-tp4.yaml (new file, 5 lines)
max_model_len: 32768
max_num_seqs: 96
gpu_memory_utilization: 0.9
kv_cache_ratio: 0.71
tensor_parallel_size: 4

benchmarks/yaml/eb45-32k-w4a8c8-tp4_decode.yaml (new file, 15 lines)
max_model_len: 32768
max_num_seqs: 256
gpu_memory_utilization: 0.9
kv_cache_ratio: 0.8
tensor_parallel_size: 4
cache_queue_port: 55663
enable_chunked_prefill: True
splitwise_role: decode
engine_worker_queue_port: 6678
cache_transfer_protocol: "rdma,ipc"
rdma_comm_ports: "7671,7672,7673,7674"
pd_comm_port: "2334"
max_num_batched_tokens: 384
max_num_partial_prefills: 3
max_long_partial_prefills: 3

benchmarks/yaml/eb45-32k-w4a8c8-tp4_prefill.yaml (new file, 12 lines)
max_model_len: 32768
max_num_seqs: 16
gpu_memory_utilization: 0.9
kv_cache_ratio: 0.9
tensor_parallel_size: 4
splitwise_role: prefill
enable_prefix_caching: True
cache_queue_port: 55664
engine_worker_queue_port: 6677
cache_transfer_protocol: "rdma,ipc"
rdma_comm_ports: "7675,7676,7677,7678"
pd_comm_port: "2333"
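The two configs above form a prefill/decode pair for splitwise serving: the decode instance listens on rdma_comm_ports 7671-7674 and pd_comm_port 2334, the prefill instance on 7675-7678 and 2333. The non-overlap requirement is inferred from these values rather than stated in the diff; a small sanity-check sketch under that assumption:

import yaml

# Hedged sketch: load the decode/prefill pair and confirm their RDMA and PD
# communication ports do not collide. The pairing rule is an inference from
# the port values shown above, not documented behaviour.
def load_ports(path):
    with open(path, encoding="utf-8") as f:
        cfg = yaml.safe_load(f)
    rdma = set(cfg.get("rdma_comm_ports", "").split(","))
    return rdma, cfg.get("pd_comm_port")

decode_rdma, decode_pd = load_ports("benchmarks/yaml/eb45-32k-w4a8c8-tp4_decode.yaml")
prefill_rdma, prefill_pd = load_ports("benchmarks/yaml/eb45-32k-w4a8c8-tp4_prefill.yaml")
assert not (decode_rdma & prefill_rdma), "RDMA port ranges overlap"
assert decode_pd != prefill_pd, "PD communication ports clash"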
benchmarks/yaml/eb45-32k-wint2-h20-tp1.yaml (new file, 6 lines)
max_model_len: 32768
max_num_seqs: 128
kv_cache_ratio: 0.75
tensor_parallel_size: 1
enable_prefix_caching: true
enable_chunked_prefill: true

benchmarks/yaml/eb45-32k-wint4-a800-tp4.yaml (new file, 5 lines)
max_model_len: 32768
max_num_seqs: 96
gpu_memory_utilization: 0.9
kv_cache_ratio: 0.71
tensor_parallel_size: 4

benchmarks/yaml/eb45-32k-wint4-h800-dp8_decode.yaml (new file, 13 lines)
max_model_len: 32768
max_num_seqs: 256
gpu_memory_utilization: 0.9
kv_cache_ratio: 0.8
tensor_parallel_size: 1
data_parallel_size: 8
num_gpu_blocks_override: 1024
cache_queue_port: 55663
splitwise_role: decode
engine_worker_queue_port: 6678
cache_transfer_protocol: "rdma"
rdma_comm_ports: "7671,7672,7673,7674,7675,7676,7677,7678"
pd_comm_port: "2334"

benchmarks/yaml/eb45-32k-wint4-h800-dp8_prefill.yaml (new file, 13 lines)
max_model_len: 32768
max_num_seqs: 16
gpu_memory_utilization: 0.9
kv_cache_ratio: 0.9
tensor_parallel_size: 1
data_parallel_size: 8
splitwise_role: prefill
cache_queue_port: 55664
engine_worker_queue_port: 6677
num_gpu_blocks_override: 1024
cache_transfer_protocol: "rdma"
rdma_comm_ports: "7671,7672,7673,7674,7675,7676,7677,7678"
pd_comm_port: "2334"

benchmarks/yaml/eb45-32k-wint4-mtp-h800-tp4.yaml (new file, 6 lines)
max_model_len: 32768
max_num_seqs: 96
gpu_memory_utilization: 0.9
kv_cache_ratio: 0.71
tensor_parallel_size: 4
quantization: wint4

benchmarks/yaml/eb45-32k-wint4-mtp-tp4-decode.yaml (new file, 13 lines)
max_model_len: 32768
max_num_seqs: 128
gpu_memory_utilization: 0.9
kv_cache_ratio: 0.7
tensor_parallel_size: 4
cache_queue_port: 55663
enable_chunked_prefill: False
enable_prefix_caching: False
splitwise_role: decode
engine_worker_queue_port: 6678
cache_transfer_protocol: "rdma,ipc"
rdma_comm_ports: "7671,7672,7673,7674"
pd_comm_port: "2334"

benchmarks/yaml/eb45-32k-wint4-mtp-tp4-prefill.yaml (new file, 12 lines)
max_model_len: 32768
max_num_seqs: 16
gpu_memory_utilization: 0.9
kv_cache_ratio: 0.9
tensor_parallel_size: 4
splitwise_role: prefill
enable_prefix_caching: False
cache_queue_port: 55664
engine_worker_queue_port: 6677
cache_transfer_protocol: "rdma,ipc"
rdma_comm_ports: "7675,7676,7677,7678"
pd_comm_port: "2333"

benchmarks/yaml/eb45-32k-wint4-p800-tp4.yaml (new file, 5 lines)
max_model_len: 32768
max_num_seqs: 40
tensor_parallel_size: 4
quantization: wint4
gpu_memory_utilization: 0.9

benchmarks/yaml/eb45-32k-wint4-p800-tp8.yaml (new file, 5 lines)
max_model_len: 32768
max_num_seqs: 160
tensor_parallel_size: 8
quantization: wint4
gpu_memory_utilization: 0.9

benchmarks/yaml/eb45-32k-wint4-prefixcache-a800-tp4.yaml (new file, 8 lines)
enable_prefix_caching: True
max_model_len: 32768
max_num_seqs: 128
gpu_memory_utilization: 0.9
kv_cache_ratio: 0.71
tensor_parallel_size: 4
swap_space: 200
cache_queue_port: 55664

benchmarks/yaml/eb45-32k-wint4-tp4_decode.yaml (new file, 15 lines)
max_model_len: 32768
max_num_seqs: 256
gpu_memory_utilization: 0.9
kv_cache_ratio: 0.8
tensor_parallel_size: 4
cache_queue_port: 55663
enable_chunked_prefill: True
splitwise_role: decode
engine_worker_queue_port: 6678
cache_transfer_protocol: "rdma,ipc"
rdma_comm_ports: "7671,7672,7673,7674"
pd_comm_port: "2334"
max_num_batched_tokens: 384
max_num_partial_prefills: 3
max_long_partial_prefills: 3

benchmarks/yaml/eb45-32k-wint4-tp4_prefill.yaml (new file, 12 lines)
max_model_len: 32768
max_num_seqs: 16
gpu_memory_utilization: 0.9
kv_cache_ratio: 0.9
tensor_parallel_size: 4
splitwise_role: prefill
enable_prefix_caching: True
cache_queue_port: 55664
engine_worker_queue_port: 6677
cache_transfer_protocol: "rdma,ipc"
rdma_comm_ports: "7675,7676,7677,7678"
pd_comm_port: "2333"

benchmarks/yaml/eb45-32k-wint8-a800-tp8.yaml (new file, 5 lines)
max_model_len: 32768
max_num_seqs: 96
gpu_memory_utilization: 0.9
kv_cache_ratio: 0.71
tensor_parallel_size: 8

benchmarks/yaml/eb45-32k-wint8-p800-tp8.yaml (new file, 5 lines)
max_model_len: 32768
max_num_seqs: 80
tensor_parallel_size: 8
quantization: wint8
gpu_memory_utilization: 0.9

benchmarks/yaml/eb45-32k-wint8-prefixcache-a800-tp8.yaml (new file, 9 lines)
enable_prefix_caching: True
max_model_len: 32768
max_num_batched_tokens: 68304
max_num_seqs: 128
gpu_memory_utilization: 0.9
kv_cache_ratio: 0.71
tensor_parallel_size: 8
swap_space: 100
cache_queue_port: 55664

benchmarks/yaml/eb45-vl-32k-wint4-a800-tp8.yaml (new file, 9 lines)
enable_mm: True
max_model_len: 32768
max_num_seqs: 56
gpu_memory_utilization: 0.9
kv_cache_ratio: 0.8
tensor_parallel_size: 8
quantization: wint4
limit_mm_per_prompt: '{"image": 100, "video": 100}'
reasoning_parser: ernie-45-vl

benchmarks/yaml/eb45-vl-32k-wint4-h800-tp8.yaml (new file, 11 lines)
enable_mm: True
max_model_len: 32768
max_num_seqs: 56
gpu_memory_utilization: 0.8
kv_cache_ratio: 0.8
tensor_parallel_size: 8
quantization: wint4
limit_mm_per_prompt: '{"image": 100, "video": 100}'
enable_chunked_prefill: True
max_num_batched_tokens: 384
reasoning_parser: ernie-45-vl

benchmarks/yaml/eb45-vl-32k-wint4-tp4.yaml (new file, 9 lines)
enable_mm: True
max_model_len: 32768
max_num_seqs: 36
gpu_memory_utilization: 0.9
kv_cache_ratio: 0.8
tensor_parallel_size: 4
quantization: wint4
limit_mm_per_prompt: '{"image": 100, "video": 100}'
reasoning_parser: ernie-45-vl

benchmarks/yaml/eb45-vl-32k-wint8-a800-tp8.yaml (new file, 9 lines)
enable_mm: True
max_model_len: 32768
max_num_seqs: 36
gpu_memory_utilization: 0.95
kv_cache_ratio: 0.8
tensor_parallel_size: 8
quantization: wint8
limit_mm_per_prompt: '{"image": 100, "video": 100}'
reasoning_parser: ernie-45-vl

benchmarks/yaml/eb45-vl-32k-wint8-h800-tp8.yaml (new file, 11 lines)
enable_mm: True
max_model_len: 32768
max_num_seqs: 36
gpu_memory_utilization: 0.8
kv_cache_ratio: 0.8
tensor_parallel_size: 8
quantization: wint8
limit_mm_per_prompt: '{"image": 100, "video": 100}'
enable_chunked_prefill: True
max_num_batched_tokens: 384
reasoning_parser: ernie-45-vl

benchmarks/yaml/eb45-vl-32k-wint8-tp4.yaml (new file, 9 lines)
enable_mm: True
max_model_len: 32768
max_num_seqs: 36
gpu_memory_utilization: 0.9
kv_cache_ratio: 0.8
tensor_parallel_size: 4
quantization: wint8
limit_mm_per_prompt: '{"image": 100, "video": 100}'
reasoning_parser: ernie-45-vl

(new 5-line YAML file, name not shown in this view)
max_model_len: 32768
max_num_seqs: 128
kv_cache_ratio: 0.75
tensor_parallel_size: 1
enable_static_graph_inference: True

(new 5-line YAML file, name not shown in this view)
max_model_len: 32768
max_num_seqs: 128
kv_cache_ratio: 0.75
tensor_parallel_size: 1
enable_static_graph_inference: True

(new 6-line YAML file, name not shown in this view)
max_model_len: 32768
max_num_seqs: 128
kv_cache_ratio: 0.75
tensor_parallel_size: 1
quantization: wint8
enable_static_graph_inference: True

(new 6-line YAML file, name not shown in this view)
max_model_len: 32768
max_num_seqs: 128
kv_cache_ratio: 0.75
tensor_parallel_size: 1
quantization: wint8
enable_static_graph_inference: True

benchmarks/yaml/eb45t_21b-32k-bf16-h800-tp1-static.yaml (new file, 5 lines)
max_model_len: 32768
max_num_seqs: 128
kv_cache_ratio: 0.75
tensor_parallel_size: 1
enable_static_graph_inference: True

benchmarks/yaml/eb45t_21b-32k-wint4-h800-tp1-static.yaml (new file, 6 lines)
max_model_len: 32768
max_num_seqs: 128
kv_cache_ratio: 0.75
tensor_parallel_size: 1
quantization: wint4
enable_static_graph_inference: True

(new 6-line YAML file, name not shown in this view)
max_model_len: 32768
max_num_seqs: 96
gpu_memory_utilization: 0.9
kv_cache_ratio: 0.71
tensor_parallel_size: 4
enable_static_graph_inference: True

benchmarks/yaml/qwen2_7b-32k-bf16-a30-tp1-static.yaml (new file, 5 lines)
max_model_len: 32768
max_num_seqs: 128
kv_cache_ratio: 0.75
tensor_parallel_size: 1
enable_static_graph_inference: True

benchmarks/yaml/qwen2_7b-32k-bf16-h800-tp1-static.yaml (new file, 5 lines)
max_model_len: 32768
max_num_seqs: 128
kv_cache_ratio: 0.75
tensor_parallel_size: 1
enable_static_graph_inference: True

benchmarks/yaml/qwen2_7b-32k-bf16-h800-tp1.yaml (new file, 4 lines)
max_model_len: 32768
max_num_seqs: 128
kv_cache_ratio: 0.75
tensor_parallel_size: 1

benchmarks/yaml/qwen2_7b-32k-fp8-h800-tp1-static.yaml (new file, 6 lines)
max_model_len: 32768
max_num_seqs: 128
kv_cache_ratio: 0.75
tensor_parallel_size: 1
quantization: wfp8afp8
enable_static_graph_inference: True

benchmarks/yaml/qwen2_7b-32k-fp8-h800-tp1.yaml (new file, 5 lines)
max_model_len: 32768
max_num_seqs: 128
kv_cache_ratio: 0.75
tensor_parallel_size: 1
quantization: wfp8afp8

benchmarks/yaml/qwen2_7b-32k-wint8-h800-tp1.yaml (new file, 5 lines)
max_model_len: 32768
max_num_seqs: 128
kv_cache_ratio: 0.75
tensor_parallel_size: 1
quantization: wint8

(new 5-line YAML file, name not shown in this view)
max_model_len: 32768
max_num_seqs: 128
kv_cache_ratio: 0.75
tensor_parallel_size: 1
enable_static_graph_inference: True

(new 5-line YAML file, name not shown in this view)
max_model_len: 32768
max_num_seqs: 128
kv_cache_ratio: 0.75
tensor_parallel_size: 1
enable_static_graph_inference: True

(new 6-line YAML file, name not shown in this view)
max_model_len: 32768
max_num_seqs: 128
kv_cache_ratio: 0.75
tensor_parallel_size: 1
quantization: wint8
enable_static_graph_inference: True

(new 6-line YAML file, name not shown in this view)
max_model_len: 32768
max_num_seqs: 128
kv_cache_ratio: 0.75
tensor_parallel_size: 1
quantization: wint8
enable_static_graph_inference: True

benchmarks/yaml/qwen3_30b-32k-bf16-h800-tp1-static.yaml (new file, 5 lines)
max_model_len: 32768
max_num_seqs: 128
kv_cache_ratio: 0.75
tensor_parallel_size: 1
enable_static_graph_inference: True

benchmarks/yaml/qwen3_30b-32k-wint4-h800-tp1-static.yaml (new file, 6 lines)
max_model_len: 32768
max_num_seqs: 128
kv_cache_ratio: 0.75
tensor_parallel_size: 1
quantization: wint4
enable_static_graph_inference: True

benchmarks/yaml/qwen3dot6b-32k-bf16-a30-tp1.yaml (new file, 5 lines)
max_model_len: 32768
max_num_seqs: 256
gpu_memory_utilization: 0.9
kv_cache_ratio: 0.75
tensor_parallel_size: 1

benchmarks/yaml/qwen3dot6b-32k-bf16-a800-tp1.yaml (new file, 5 lines)
max_model_len: 32768
max_num_seqs: 256
gpu_memory_utilization: 0.9
kv_cache_ratio: 0.75
tensor_parallel_size: 1

benchmarks/yaml/qwen3dot6b-32k-bf16-h800-tp1.yaml (new file, 5 lines)
max_model_len: 32768
max_num_seqs: 256
gpu_memory_utilization: 0.9
kv_cache_ratio: 0.75
tensor_parallel_size: 1

benchmarks/yaml/qwen3dot6b-32k-wint8-a30-tp1.yaml (new file, 6 lines)
max_model_len: 32768
max_num_seqs: 256
gpu_memory_utilization: 0.9
kv_cache_ratio: 0.75
quantization: wint8
tensor_parallel_size: 1

benchmarks/yaml/qwen3dot6b-32k-wint8-a800-tp1.yaml (new file, 6 lines)
max_model_len: 32768
max_num_seqs: 256
gpu_memory_utilization: 0.9
kv_cache_ratio: 0.75
quantization: wint8
tensor_parallel_size: 1

benchmarks/yaml/qwen3dot6b-32k-wint8-h800-tp1.yaml (new file, 6 lines)
max_model_len: 32768
max_num_seqs: 256
gpu_memory_utilization: 0.9
kv_cache_ratio: 0.75
quantization: wint8
tensor_parallel_size: 1

benchmarks/yaml/qwen3moe235b-32k-wint4-h800-tp4.yaml (new file, 6 lines)
max_model_len: 32768
max_num_seqs: 75
gpu_memory_utilization: 0.85
kv_cache_ratio: 0.75
quantization: wint4
tensor_parallel_size: 4

benchmarks/yaml/qwen3moe235b-32k-wint8-h800-tp4.yaml (new file, 6 lines)
max_model_len: 32768
max_num_seqs: 25
gpu_memory_utilization: 0.9
kv_cache_ratio: 0.75
quantization: wint8
tensor_parallel_size: 4

benchmarks/yaml/qwen3moe30b-32k-bf16-a800-tp1.yaml (new file, 5 lines)
max_model_len: 32768
max_num_seqs: 50
gpu_memory_utilization: 0.85
kv_cache_ratio: 0.75
tensor_parallel_size: 1

benchmarks/yaml/qwen3moe30b-32k-bf16-h800-tp1.yaml (new file, 5 lines)
max_model_len: 32768
max_num_seqs: 50
gpu_memory_utilization: 0.85
kv_cache_ratio: 0.75
tensor_parallel_size: 1

benchmarks/yaml/qwen3moe30b-32k-wint4-a800-tp1.yaml (new file, 6 lines)
max_model_len: 32768
max_num_seqs: 50
gpu_memory_utilization: 0.8
kv_cache_ratio: 0.75
quantization: wint4
tensor_parallel_size: 1

benchmarks/yaml/qwen3moe30b-32k-wint4-h800-tp1.yaml (new file, 6 lines)
max_model_len: 32768
max_num_seqs: 50
gpu_memory_utilization: 0.8
kv_cache_ratio: 0.75
quantization: wint4
tensor_parallel_size: 1
benchmarks/yaml/request_yaml/eb45-128k.yaml (new file, 8 lines)
top_p: 0.8
temperature: 0.8
metadata:
  min_tokens: 1
max_tokens: 131071
repetition_penalty: 1.0
frequency_penalty: 0
presence_penalty: 0

benchmarks/yaml/request_yaml/eb45-32k.yaml (new file, 8 lines)
top_p: 0.8
temperature: 0.8
metadata:
  min_tokens: 1
max_tokens: 12288
repetition_penalty: 1.0
frequency_penalty: 0
presence_penalty: 0

benchmarks/yaml/request_yaml/qwen2-32k.yaml (new file, 8 lines)
top_p: 0.8
temperature: 0.7
metadata:
  min_tokens: 1
max_tokens: 12288
repetition_penalty: 1.05
frequency_penalty: 0
presence_penalty: 0

benchmarks/yaml/request_yaml/qwen3-32k.yaml (new file, 8 lines)
top_p: 0.8
temperature: 0.7
metadata:
  min_tokens: 1
max_tokens: 12288
repetition_penalty: 1.0
frequency_penalty: 0
presence_penalty: 1.5

benchmarks/yaml/request_yaml/x1-32k.yaml (new file, 8 lines)
top_p: 0.95
temperature: 0.6
metadata:
  min_tokens: 1
max_tokens: 32767
repetition_penalty: 1.0
frequency_penalty: 0
presence_penalty: 0
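These request_yaml files carry per-request sampling parameters. The sketch below shows one way such a file could be folded into an OpenAI-style chat payload; the model name and the exact mapping used by benchmark_serving.py (whose diff is suppressed above) are assumptions for illustration only.

import yaml

# Hedged illustration: read benchmarks/yaml/request_yaml/eb45-32k.yaml and merge
# its keys into a chat-completions request body. pyyaml is already listed in
# benchmarks/requirements.txt.
with open("benchmarks/yaml/request_yaml/eb45-32k.yaml", encoding="utf-8") as f:
    req_cfg = yaml.safe_load(f)

payload = {
    "model": "eb45",  # placeholder model name, not taken from this diff
    "messages": [{"role": "user", "content": "hello"}],
    **{k: v for k, v in req_cfg.items() if k != "metadata"},
    "metadata": req_cfg.get("metadata", {}),
}
print(payload["top_p"], payload["max_tokens"])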
benchmarks/yaml/x1-32k-wint4-h800-tp8.yaml (new file, 6 lines)
tensor_parallel_size: 8
max_model_len: 32768
max_num_seqs: 32
num_gpu_blocks_override: 4096
kv_cache_ratio: 0.5
reasoning_parser: ernie-x1

benchmarks/yaml/x1-32k-wint4-p800-tp4.yaml (new file, 6 lines)
max_model_len: 32768
max_num_seqs: 32
gpu_memory_utilization: 0.9
tensor_parallel_size: 4
quantization: wint4
reasoning_parser: ernie-x1

benchmarks/yaml/x1-32k-wint4-p800-tp8.yaml (new file, 6 lines)
max_model_len: 32768
max_num_seqs: 128
gpu_memory_utilization: 0.9
tensor_parallel_size: 8
quantization: wint4
reasoning_parser: ernie-x1

benchmarks/yaml/x1-32k-wint4-prefixcache-h800-tp8.yaml (new file, 10 lines)
enable_prefix_caching: True
num_gpu_blocks_override: 8000
max_model_len: 32768
max_num_seqs: 64
gpu_memory_utilization: 0.85
kv_cache_ratio: 0.5
tensor_parallel_size: 8
swap_space: 200
cache_queue_port: 55664
reasoning_parser: ernie-x1

benchmarks/yaml/x1-32k-wint8-h800-tp8.yaml (new file, 6 lines)
tensor_parallel_size: 8
max_model_len: 32768
max_num_seqs: 32
num_gpu_blocks_override: 4096
kv_cache_ratio: 0.5
reasoning_parser: ernie-x1

benchmarks/yaml/x1-32k-wint8-p800-tp4.yaml (new file, 6 lines)
max_model_len: 32768
max_num_seqs: 8
gpu_memory_utilization: 0.9
tensor_parallel_size: 4
quantization: wint8
reasoning_parser: ernie-x1

benchmarks/yaml/x1-32k-wint8-p800-tp8.yaml (new file, 6 lines)
max_model_len: 32768
max_num_seqs: 64
gpu_memory_utilization: 0.9
tensor_parallel_size: 8
quantization: wint8
reasoning_parser: ernie-x1

benchmarks/yaml/x1-32k-wint8-prefixcache-h800-tp8.yaml (new file, 10 lines)
enable_prefix_caching: True
num_gpu_blocks_override: 8000
max_model_len: 32768
max_num_seqs: 64
gpu_memory_utilization: 0.85
kv_cache_ratio: 0.5
tensor_parallel_size: 8
swap_space: 200
cache_queue_port: 55664
reasoning_parser: ernie-x1
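The server-side configs above are engine launch parameters rather than request parameters. A hedged sketch of turning one of them into command-line flags follows; the underscore-to-dash flag convention is an assumption and should be checked against the FastDeploy serving entrypoint, which this diff does not show.

import yaml

# Sketch only: convert a benchmark server config into CLI-style flags.
def yaml_to_cli_flags(path: str) -> list:
    with open(path, encoding="utf-8") as f:
        cfg = yaml.safe_load(f)
    flags = []
    for key, value in cfg.items():
        flags.append("--" + key.replace("_", "-"))  # assumed flag spelling
        flags.append(str(value))
    return flags

print(" ".join(yaml_to_cli_flags("benchmarks/yaml/eb45-32k-wint4-a800-tp4.yaml")))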
build.sh (modified)
@@ -17,8 +17,9 @@
 BUILD_WHEEL=${1:-1}
 PYTHON_VERSION=${2:-"python"}
 export python=$PYTHON_VERSION
-CPU_USE_BF16=${3:-"false"}
-BUILDING_ARCS=${4:-""}
+FD_CPU_USE_BF16=${3:-"false"}
+FD_BUILDING_ARCS=${4:-""}
+
 
 # paddle distributed use to set archs
 unset PADDLE_CUDA_ARCH_LIST
@@ -30,13 +31,9 @@ EGG_DIR="fastdeploy.egg-info"
 
 # custom_ops directory config
 OPS_SRC_DIR="custom_ops"
-OPS_BUILD_DIR="build"
-OPS_EGG_DIR="efficitentllm_ops.egg-info"
 OPS_TMP_DIR_BASE="tmp_base"
 OPS_TMP_DIR="tmp"
 
-TEST_DIR="tests"
-
 # command line log config
 RED='\033[0;31m'
 BLUE='\033[0;34m'
@@ -44,13 +41,14 @@ GREEN='\033[1;32m'
 BOLD='\033[1m'
 NONE='\033[0m'
 
+DEVICE_TYPE="gpu"
+
 function python_version_check() {
     PY_MAIN_VERSION=`${python} -V 2>&1 | awk '{print $2}' | awk -F '.' '{print $1}'`
     PY_SUB_VERSION=`${python} -V 2>&1 | awk '{print $2}' | awk -F '.' '{print $2}'`
     echo -e "find python version ${PY_MAIN_VERSION}.${PY_SUB_VERSION}"
-    if [ $PY_MAIN_VERSION -ne "3" -o $PY_SUB_VERSION -lt "8" ]; then
-        echo -e "${RED}FAIL:${NONE} please use Python >= 3.8"
+    if [ $PY_MAIN_VERSION -ne "3" -o $PY_SUB_VERSION -lt "9" ]; then
+        echo -e "${RED}FAIL:${NONE} please use Python >= 3.9"
         exit 1
     fi
 }
@@ -75,6 +73,7 @@ function copy_ops(){
     WHEEL_CPU_NAME="fastdeploy_cpu_ops-${OPS_VERSION}-${PY_VERSION}-${SYSTEM_VERSION}-${PROCESSOR_VERSION}.egg"
     is_rocm=`$python -c "import paddle; print(paddle.is_compiled_with_rocm())"`
     if [ "$is_rocm" = "True" ]; then
+        DEVICE_TYPE="rocm"
         cp -r ./${OPS_TMP_DIR}/${WHEEL_NAME}/* ../fastdeploy/model_executor/ops/gpu
         echo -e "ROCM ops have been copy to fastdeploy"
         return
@@ -82,6 +81,7 @@ function copy_ops(){
     mkdir -p ../fastdeploy/model_executor/ops/base
     is_cuda=`$python -c "import paddle; print(paddle.is_compiled_with_cuda())"`
     if [ "$is_cuda" = "True" ]; then
+        DEVICE_TYPE="gpu"
         cp -r ./${OPS_TMP_DIR_BASE}/${WHEEL_BASE_NAME}/* ../fastdeploy/model_executor/ops/base
         cp -r ./${OPS_TMP_DIR}/${WHEEL_NAME}/* ../fastdeploy/model_executor/ops/gpu
         echo -e "BASE and CUDA ops have been copy to fastdeploy"
@@ -90,6 +90,7 @@ function copy_ops(){
 
     is_xpu=`$python -c "import paddle; print(paddle.is_compiled_with_xpu())"`
     if [ "$is_xpu" = "True" ]; then
+        DEVICE_TYPE="xpu"
         cp -r ./${OPS_TMP_DIR}/${WHEEL_NAME}/* ../fastdeploy/model_executor/ops/xpu
         echo -e "xpu ops have been copy to fastdeploy"
         return
@@ -97,20 +98,14 @@ function copy_ops(){
 
     is_npu=`$python -c "import paddle; print(paddle.is_compiled_with_custom_device('npu'))"`
     if [ "$is_npu" = "True" ]; then
+        DEVICE_TYPE="npu"
         cp -r ${OPS_TMP_DIR}/${WHEEL_NAME}/* ../fastdeploy/model_executor/ops/npu
         echo -e "npu ops have been copy to fastdeploy"
         return
     fi
 
+    DEVICE_TYPE="cpu"
     cp -r ./${OPS_TMP_DIR_BASE}/${WHEEL_BASE_NAME}/* ../fastdeploy/model_executor/ops/base
-    cd ${OPS_TMP_DIR}/${WHEEL_CPU_NAME}/xFasterTransformer/build/
-    for file in *_pd_.so; do
-        mv "$file" "${file/_pd_/}"
-    done
-    cd ../../x86-simd-sort/builddir/
-    for file in *_pd_.so; do
-        mv "$file" "${file/_pd_/}"
-    done
     cd ../../../../
     cp -r ${OPS_TMP_DIR}/${WHEEL_CPU_NAME}/* ../fastdeploy/model_executor/ops/cpu
     echo -e "BASE and CPU ops have been copy to fastdeploy"
@@ -122,15 +117,30 @@ function build_and_install_ops() {
     export no_proxy=bcebos.com,paddlepaddle.org.cn,${no_proxy}
     echo -e "${BLUE}[build]${NONE} build and install fastdeploy_base_ops..."
     ${python} setup_ops_base.py install --install-lib ${OPS_TMP_DIR_BASE}
+    find ${OPS_TMP_DIR_BASE} -type f -name "*.o" -exec rm -f {} \;
     echo -e "${BLUE}[build]${NONE} build and install fastdeploy_ops..."
-    if [ "$CPU_USE_BF16" == "true" ]; then
-        CPU_USE_BF16=True ${python} setup_ops.py install --install-lib ${OPS_TMP_DIR}
-        :
-    elif [ "$CPU_USE_BF16" == "false" ]; then
+    TMP_DIR_REAL_PATH=`readlink -f ${OPS_TMP_DIR}`
+    is_xpu=`$python -c "import paddle; print(paddle.is_compiled_with_xpu())"`
+    if [ "$is_xpu" = "True" ]; then
+        cd xpu_ops/src
+        bash build.sh ${TMP_DIR_REAL_PATH}
+        cd ../..
+    elif [ "$FD_CPU_USE_BF16" == "true" ]; then
+        if [ "$FD_BUILDING_ARCS" == "" ]; then
+            FD_CPU_USE_BF16=True ${python} setup_ops.py install --install-lib ${OPS_TMP_DIR}
+        else
+            FD_BUILDING_ARCS=${FD_BUILDING_ARCS} FD_CPU_USE_BF16=True ${python} setup_ops.py install --install-lib ${OPS_TMP_DIR}
+        fi
+        find ${OPS_TMP_DIR} -type f -name "*.o" -exec rm -f {} \;
+    elif [ "$FD_CPU_USE_BF16" == "false" ]; then
+        if [ "$FD_BUILDING_ARCS" == "" ]; then
         ${python} setup_ops.py install --install-lib ${OPS_TMP_DIR}
-        :
+        else
+            FD_BUILDING_ARCS=${FD_BUILDING_ARCS} ${python} setup_ops.py install --install-lib ${OPS_TMP_DIR}
+        fi
+        find ${OPS_TMP_DIR} -type f -name "*.o" -exec rm -f {} \;
     else
-        echo "Error: Invalid parameter '$CPU_USE_BF16'. Please use true or false."
+        echo "Error: Invalid parameter '$FD_CPU_USE_BF16'. Please use true or false."
         exit 1
     fi
     if [ $? -ne 0 ]; then
@@ -146,11 +156,7 @@ function build_and_install_ops() {
 
 function build_and_install() {
     echo -e "${BLUE}[build]${NONE} building fastdeploy wheel..."
-    if [ "$BUILDING_ARCS" == "" ]; then
-        ${python} setup.py bdist_wheel --python-tag py3
-    else
-        BUILDING_ARCS=${BUILDING_ARCS} ${python} setup.py bdist_wheel --python-tag py3
-    fi
+    ${python} setup.py bdist_wheel --python-tag=py3
 
     if [ $? -ne 0 ]; then
         echo -e "${RED}[FAIL]${NONE} build fastdeploy wheel failed"
@@ -174,10 +180,12 @@ function cleanup() {
     rm -rf $BUILD_DIR $EGG_DIR
     if [ `${python} -m pip list | grep fastdeploy | wc -l` -gt 0 ]; then
         echo -e "${BLUE}[init]${NONE} uninstalling fastdeploy..."
-        ${python} -m pip uninstall -y fastdeploy
+        ${python} -m pip uninstall -y fastdeploy-${DEVICE_TYPE}
     fi
 
     rm -rf $OPS_SRC_DIR/$BUILD_DIR $OPS_SRC_DIR/$EGG_DIR
+    rm -rf $OPS_SRC_DIR/$OPS_TMP_DIR_BASE
+    rm -rf $OPS_SRC_DIR/$OPS_TMP_DIR
 }
 
 function abort() {
@@ -187,7 +195,7 @@ function abort() {
     cur_dir=`basename "$pwd"`
 
     rm -rf $BUILD_DIR $EGG_DIR $DIST_DIR
-    ${python} -m pip uninstall -y fastdeploy
+    ${python} -m pip uninstall -y fastdeploy-${DEVICE_TYPE}
 
     rm -rf $OPS_SRC_DIR/$BUILD_DIR $OPS_SRC_DIR/$EGG_DIR
 }
custom_ops/0001-DeepGEMM-95e81b3.patch (new file, 643 lines)
|
|||||||
|
From 5112002c155dceecc5e5983cdb67157e4f5400e2 Mon Sep 17 00:00:00 2001
|
||||||
|
From: minghaipeng <minghaipeng@baidu.com>
|
||||||
|
Date: Wed, 25 Jun 2025 15:05:24 +0800
|
||||||
|
Subject: [PATCH] DeepGEMM 95e81b3
|
||||||
|
|
||||||
|
---
|
||||||
|
deep_gemm/__init__.py | 2 +-
|
||||||
|
deep_gemm/include/deep_gemm/scheduler.cuh | 2 +-
|
||||||
|
deep_gemm/jit/compiler.py | 2 +-
|
||||||
|
deep_gemm/jit/interleave_ffma.py | 2 +-
|
||||||
|
deep_gemm/jit/runtime.py | 4 +-
|
||||||
|
deep_gemm/jit/template.py | 34 ++++----
|
||||||
|
deep_gemm/jit_kernels/gemm.py | 44 +++++------
|
||||||
|
deep_gemm/jit_kernels/m_grouped_gemm.py | 96 +++++++++++------------
|
||||||
|
deep_gemm/jit_kernels/tuner.py | 10 +--
|
||||||
|
deep_gemm/jit_kernels/utils.py | 18 +++--
|
||||||
|
deep_gemm/paddle_utils.py | 20 +++++
|
||||||
|
deep_gemm/utils.py | 30 +++----
|
||||||
|
12 files changed, 143 insertions(+), 121 deletions(-)
|
||||||
|
create mode 100644 deep_gemm/paddle_utils.py
|
||||||
|
|
||||||
|
diff --git a/deep_gemm/__init__.py b/deep_gemm/__init__.py
|
||||||
|
index 15b22ca..63e7fb7 100644
|
||||||
|
--- a/deep_gemm/__init__.py
|
||||||
|
+++ b/deep_gemm/__init__.py
|
||||||
|
@@ -1,4 +1,4 @@
|
||||||
|
-import torch
|
||||||
|
+import paddle
|
||||||
|
|
||||||
|
from . import jit
|
||||||
|
from .jit_kernels import (
|
||||||
|
diff --git a/deep_gemm/include/deep_gemm/scheduler.cuh b/deep_gemm/include/deep_gemm/scheduler.cuh
|
||||||
|
index 9743871..6c97152 100644
|
||||||
|
--- a/deep_gemm/include/deep_gemm/scheduler.cuh
|
||||||
|
+++ b/deep_gemm/include/deep_gemm/scheduler.cuh
|
||||||
|
@@ -102,7 +102,7 @@ struct Scheduler {
|
||||||
|
if constexpr (kGemmType == GemmType::Normal) {
|
||||||
|
return block_idx * block_size;
|
||||||
|
} else if constexpr (kGemmType == GemmType::GroupedContiguous) {
|
||||||
|
- auto offset = kIgnoreGroupedForGroupedContiguous ? 0 : __ldg(grouped_layout + m_block_idx * BLOCK_M);
|
||||||
|
+ auto offset = kIgnoreGroupedForGroupedContiguous ? 0 : max(0, __ldg(grouped_layout + m_block_idx * BLOCK_M));
|
||||||
|
return offset * shape_dim + block_idx * block_size;
|
||||||
|
} else if constexpr (kGemmType == GemmType::GroupedMasked) {
|
||||||
|
return curr_group_idx * shape_dim + block_idx * block_size;
|
||||||
|
diff --git a/deep_gemm/jit/compiler.py b/deep_gemm/jit/compiler.py
|
||||||
|
index c17d466..6fdc52f 100644
|
||||||
|
--- a/deep_gemm/jit/compiler.py
|
||||||
|
+++ b/deep_gemm/jit/compiler.py
|
||||||
|
@@ -4,7 +4,7 @@ import os
|
||||||
|
import re
|
||||||
|
import subprocess
|
||||||
|
import uuid
|
||||||
|
-from torch.utils.cpp_extension import CUDA_HOME
|
||||||
|
+from ..paddle_utils import CUDA_HOME
|
||||||
|
from typing import Tuple
|
||||||
|
|
||||||
|
from . import interleave_ffma
|
||||||
|
diff --git a/deep_gemm/jit/interleave_ffma.py b/deep_gemm/jit/interleave_ffma.py
|
||||||
|
index fcb377e..db9d6f3 100644
|
||||||
|
--- a/deep_gemm/jit/interleave_ffma.py
|
||||||
|
+++ b/deep_gemm/jit/interleave_ffma.py
|
||||||
|
@@ -3,7 +3,7 @@ import mmap
|
||||||
|
import os
|
||||||
|
import re
|
||||||
|
import subprocess
|
||||||
|
-from torch.utils.cpp_extension import CUDA_HOME
|
||||||
|
+from ..paddle_utils import CUDA_HOME
|
||||||
|
|
||||||
|
|
||||||
|
def run_cuobjdump(file_path):
|
||||||
|
diff --git a/deep_gemm/jit/runtime.py b/deep_gemm/jit/runtime.py
|
||||||
|
index 66c370a..4761426 100644
|
||||||
|
--- a/deep_gemm/jit/runtime.py
|
||||||
|
+++ b/deep_gemm/jit/runtime.py
|
||||||
|
@@ -1,6 +1,6 @@
|
||||||
|
import ctypes
|
||||||
|
import os
|
||||||
|
-import torch
|
||||||
|
+import paddle
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
|
from .template import map_ctype
|
||||||
|
@@ -35,7 +35,7 @@ class Runtime:
|
||||||
|
assert len(args) == len(self.args), f'Expected {len(self.args)} arguments, got {len(args)}'
|
||||||
|
cargs = []
|
||||||
|
for arg, (name, dtype) in zip(args, self.args):
|
||||||
|
- if isinstance(arg, torch.Tensor):
|
||||||
|
+ if isinstance(arg, paddle.Tensor):
|
||||||
|
assert arg.dtype == dtype, f'Expected tensor dtype `{dtype}` for `{name}`, got `{arg.dtype}`'
|
||||||
|
else:
|
||||||
|
assert isinstance(arg, dtype), f'Expected built-in type `{dtype}` for `{name}`, got `{type(arg)}`'
|
||||||
|
diff --git a/deep_gemm/jit/template.py b/deep_gemm/jit/template.py
|
||||||
|
index ead37f5..51b02c1 100644
|
||||||
|
--- a/deep_gemm/jit/template.py
|
||||||
|
+++ b/deep_gemm/jit/template.py
|
||||||
|
@@ -1,24 +1,24 @@
|
||||||
|
import copy
|
||||||
|
import ctypes
|
||||||
|
import os
|
||||||
|
-import torch
|
||||||
|
+import paddle
|
||||||
|
from typing import Any, Dict, Iterable, Tuple
|
||||||
|
|
||||||
|
|
||||||
|
# Name map for Python `eval`
|
||||||
|
typename_map: Dict[Any, str] = {
|
||||||
|
**{t: t.__name__ for t in (bool, int, float)},
|
||||||
|
- torch.int: 'torch.int',
|
||||||
|
- torch.float: 'torch.float',
|
||||||
|
- torch.bfloat16: 'torch.bfloat16',
|
||||||
|
- torch.float8_e4m3fn: 'torch.float8_e4m3fn',
|
||||||
|
- torch.cuda.Stream: 'torch.cuda.Stream',
|
||||||
|
+ paddle.int32: 'paddle.int32',
|
||||||
|
+ paddle.float32: 'paddle.float32',
|
||||||
|
+ paddle.bfloat16: 'paddle.bfloat16',
|
||||||
|
+ paddle.float8_e4m3fn: 'paddle.float8_e4m3fn',
|
||||||
|
+ paddle.device.cuda.Stream: "paddle.device.cuda.Stream",
|
||||||
|
}
|
||||||
|
|
||||||
|
# `ctype` map for Python casting
|
||||||
|
ctype_map: Dict[Any, Any] = {
|
||||||
|
**{t: getattr(ctypes, f'c_{t.__name__}') for t in (bool, int, float)},
|
||||||
|
- **{t: ctypes.c_void_p for t in (torch.int, torch.float, torch.bfloat16, torch.float8_e4m3fn, torch.cuda.Stream)},
|
||||||
|
+ **{t: ctypes.c_void_p for t in (paddle.int32, paddle.float32, paddle.bfloat16, paddle.float8_e4m3fn, paddle.device.cuda.Stream)},
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@@ -27,25 +27,25 @@ genc_map = {
|
||||||
|
bool: ('bool', 'bool'),
|
||||||
|
int: ('int', 'int'),
|
||||||
|
float: ('float', 'float'),
|
||||||
|
- torch.int: ('void*', 'int*'),
|
||||||
|
- torch.float: ('void*', 'float*'),
|
||||||
|
- torch.bfloat16: ('void*', '__nv_bfloat16*'),
|
||||||
|
- torch.float8_e4m3fn: ('void*', '__nv_fp8_e4m3*'),
|
||||||
|
- torch.cuda.Stream: ('void*', 'cudaStream_t'),
|
||||||
|
+ paddle.int32: ('void*', 'int*'),
|
||||||
|
+ paddle.float32: ('void*', 'float*'),
|
||||||
|
+ paddle.bfloat16: ('void*', '__nv_bfloat16*'),
|
||||||
|
+ paddle.float8_e4m3fn: ('void*', '__nv_fp8_e4m3*'),
|
||||||
|
+ paddle.device.cuda.Stream: ('void*', 'cudaStream_t'),
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def map_ctype(value: Any) -> Any:
|
||||||
|
if hasattr(value, 'data_ptr'):
|
||||||
|
- if value.dtype == torch.int:
|
||||||
|
+ if value.dtype == paddle.int32:
|
||||||
|
return ctypes.c_void_p(value.data_ptr())
|
||||||
|
- elif value.dtype == torch.float:
|
||||||
|
+ elif value.dtype == paddle.float32:
|
||||||
|
return ctypes.c_void_p(value.data_ptr())
|
||||||
|
- elif value.dtype == torch.bfloat16:
|
||||||
|
+ elif value.dtype == paddle.bfloat16:
|
||||||
|
return ctypes.c_void_p(value.data_ptr())
|
||||||
|
- elif value.dtype == torch.float16:
|
||||||
|
+ elif value.dtype == paddle.float16:
|
||||||
|
return ctypes.c_void_p(value.data_ptr())
|
||||||
|
- elif value.dtype == torch.float8_e4m3fn:
|
||||||
|
+ elif value.dtype == paddle.float8_e4m3fn:
|
||||||
|
return ctypes.c_void_p(value.data_ptr())
|
||||||
|
else:
|
||||||
|
return ctypes.c_void_p(value.data_ptr())
|
||||||
|
diff --git a/deep_gemm/jit_kernels/gemm.py b/deep_gemm/jit_kernels/gemm.py
|
||||||
|
index cb438b7..44aa0ed 100644
|
||||||
|
--- a/deep_gemm/jit_kernels/gemm.py
|
||||||
|
+++ b/deep_gemm/jit_kernels/gemm.py
|
||||||
|
@@ -1,5 +1,5 @@
|
||||||
|
import math
|
||||||
|
-import torch
|
||||||
|
+import paddle
|
||||||
|
from functools import lru_cache
|
||||||
|
from typing import Tuple
|
||||||
|
|
||||||
|
@@ -166,20 +166,20 @@ def get_best_configs(m: int, n: int, k: int, num_groups: int, num_sms: int,
|
||||||
|
return num_min_sms, best_block_m, best_block_n, best_num_stages, best_tma_multicast_config, best_smem_config
|
||||||
|
|
||||||
|
|
||||||
|
-def gemm_fp8_fp8_bf16_nt(lhs: Tuple[torch.Tensor, torch.Tensor],
|
||||||
|
- rhs: Tuple[torch.Tensor, torch.Tensor],
|
||||||
|
- out: torch.Tensor) -> None:
|
||||||
|
+def gemm_fp8_fp8_bf16_nt(lhs: Tuple[paddle.Tensor, paddle.Tensor],
|
||||||
|
+ rhs: Tuple[paddle.Tensor, paddle.Tensor],
|
||||||
|
+ out: paddle.Tensor) -> None:
|
||||||
|
"""
|
||||||
|
Do a normal GEMM with FP8 inputs and BF16 output, with 1x128 LHS scaling and 128x128 RHS scaling.
|
||||||
|
LHS, RHS, RHS scaling factors, and output tensors must be in contiguous format.
|
||||||
|
RHS and RHS scaling factors are required to be transposed.
|
||||||
|
The LHS scaling tensor requires TMA-aligned transposed format, if your input does not match the requirement,
|
||||||
|
- this function will do a transposing with a set of slow PyTorch operations.
|
||||||
|
+ this function will do a transposing with a set of slow paddle operations.
|
||||||
|
|
||||||
|
Arguments:
|
||||||
|
- lhs: the first element is an FP8 tensor (typed `torch.float8_e4m3fn`) of shape `[m, k]`,
|
||||||
|
+ lhs: the first element is an FP8 tensor (typed `paddle.float8_e4m3fn`) of shape `[m, k]`,
|
||||||
|
the second element is an FP32 1x128 scaling tensor for LHS of shape `[m, ⌈k / 128⌉]`.
|
||||||
|
- rhs: the first element is an FP8 tensor (typed `torch.float8_e4m3fn`) of shape `[n, k]`.
|
||||||
|
+ rhs: the first element is an FP8 tensor (typed `paddle.float8_e4m3fn`) of shape `[n, k]`.
|
||||||
|
the second element is an FP32 128x128 scaling tensor for RHS of shape `[⌈n / 128⌉, ⌈k / 128⌉]`.
|
||||||
|
out: the BF16 output tensor of shape `[m, n]`, representing the result.
|
||||||
|
"""
|
||||||
|
@@ -189,22 +189,22 @@ def gemm_fp8_fp8_bf16_nt(lhs: Tuple[torch.Tensor, torch.Tensor],
|
||||||
|
n, k_ = rhs.shape
|
||||||
|
m_, n_ = out.shape
|
||||||
|
|
||||||
|
- assert n % 64 == 0 and k % 128 == 0
|
||||||
|
+ # assert n % 64 == 0 and k % 128 == 0
|
||||||
|
|
||||||
|
# Type and shape checks
|
||||||
|
- assert m == m_ and n == n_ and k == k_
|
||||||
|
- assert n > 0 and k > 0
|
||||||
|
- assert lhs_scales.shape == (m, (k + 127) // 128)
|
||||||
|
- assert rhs_scales.shape == ((n + 127) // 128, (k + 127) // 128)
|
||||||
|
- assert lhs.dtype == torch.float8_e4m3fn and lhs_scales.dtype == torch.float32
|
||||||
|
- assert rhs.dtype == torch.float8_e4m3fn and rhs_scales.dtype == torch.float32
|
||||||
|
- assert out.dtype == torch.bfloat16
|
||||||
|
- assert lhs.is_contiguous() and rhs.is_contiguous() and out.is_contiguous()
|
||||||
|
+ # assert m == m_ and n == n_ and k == k_
|
||||||
|
+ # assert n > 0 and k > 0
|
||||||
|
+ # assert lhs_scales.shape == (m, (k + 127) // 128)
|
||||||
|
+ # assert rhs_scales.shape == ((n + 127) // 128, (k + 127) // 128)
|
||||||
|
+ # assert lhs.dtype == paddle.float8_e4m3fn and lhs_scales.dtype == paddle.float32
|
||||||
|
+ # assert rhs.dtype == paddle.float8_e4m3fn and rhs_scales.dtype == paddle.float32
|
||||||
|
+ # assert out.dtype == paddle.bfloat16
|
||||||
|
+ # assert lhs.is_contiguous() and rhs.is_contiguous() and out.is_contiguous()
|
||||||
|
|
||||||
|
# LHS scales must be transposed for TMA load, but not for RHS scales
|
||||||
|
# NOTES: `get_tma_aligned_lhs_scales` may launch a kernel if not processed by previous kernels
|
||||||
|
lhs_scales = get_col_major_tma_aligned_tensor(lhs_scales)
|
||||||
|
- assert rhs_scales.is_contiguous()
|
||||||
|
+ # assert rhs_scales.is_contiguous()
|
||||||
|
|
||||||
|
# Do nothing if `m` is zero
|
||||||
|
if m == 0:
|
||||||
|
@@ -214,7 +214,7 @@ def gemm_fp8_fp8_bf16_nt(lhs: Tuple[torch.Tensor, torch.Tensor],
|
||||||
|
global includes, template
|
||||||
|
num_sms = get_num_sms()
|
||||||
|
num_sms, block_m, block_n, num_stages, tma_multicast_config, smem_config = get_best_configs(m, n, k, 1, num_sms)
|
||||||
|
- args = (lhs, lhs_scales, rhs, rhs_scales, out, m, torch.cuda.current_stream(), num_sms, smem_config[0])
|
||||||
|
+ args = (lhs, lhs_scales, rhs, rhs_scales, out, m, paddle.device.cuda.current_stream(), num_sms, smem_config[0])
|
||||||
|
runtime = jit_tuner.compile_and_tune(
|
||||||
|
name='gemm_fp8_fp8_bf16_nt',
|
||||||
|
keys={'N': n, 'K': k, 'BLOCK_M': block_m, 'BLOCK_N': block_n,
|
||||||
|
@@ -225,10 +225,10 @@ def gemm_fp8_fp8_bf16_nt(lhs: Tuple[torch.Tensor, torch.Tensor],
|
||||||
|
'IS_TMA_MULTICAST_ON_A': tma_multicast_config[1]},
|
||||||
|
space=(),
|
||||||
|
includes=includes,
|
||||||
|
- arg_defs=(('lhs', torch.float8_e4m3fn), ('lhs_scales', torch.float),
|
||||||
|
- ('rhs', torch.float8_e4m3fn), ('rhs_scales', torch.float),
|
||||||
|
- ('out', torch.bfloat16), ('m', int),
|
||||||
|
- ('stream', torch.cuda.Stream), ('num_sms', int), ('smem_size', int)),
|
||||||
|
+ arg_defs=(('lhs', paddle.float8_e4m3fn), ('lhs_scales', paddle.float32),
|
||||||
|
+ ('rhs', paddle.float8_e4m3fn), ('rhs_scales', paddle.float32),
|
||||||
|
+ ('out', paddle.bfloat16), ('m', int),
|
||||||
|
+ ('stream', paddle.device.cuda.Stream), ('num_sms', int), ('smem_size', int)),
|
||||||
|
template=template,
|
||||||
|
args=args
|
||||||
|
)
|
||||||
|
diff --git a/deep_gemm/jit_kernels/m_grouped_gemm.py b/deep_gemm/jit_kernels/m_grouped_gemm.py
index 3b518c9..ba776bd 100644
--- a/deep_gemm/jit_kernels/m_grouped_gemm.py
+++ b/deep_gemm/jit_kernels/m_grouped_gemm.py
@@ -1,4 +1,4 @@
-import torch
+import paddle
from typing import Tuple

from .gemm import get_best_configs, get_block_n_padding_for_smem_d
@@ -37,25 +37,25 @@ gemm_t::run(out, rhs_scales, grouped_layout,
"""


-def m_grouped_gemm_fp8_fp8_bf16_nt_contiguous(lhs: Tuple[torch.Tensor, torch.Tensor],
- rhs: Tuple[torch.Tensor, torch.Tensor],
- out: torch.Tensor, m_indices: torch.Tensor) -> None:
+def m_grouped_gemm_fp8_fp8_bf16_nt_contiguous(lhs: Tuple[paddle.Tensor, paddle.Tensor],
+ rhs: Tuple[paddle.Tensor, paddle.Tensor],
+ out: paddle.Tensor, m_indices: paddle.Tensor) -> None:
"""
Do a grouped GEMM (contiguous format) with FP8 inputs and BF16 output, with 1x128 LHS scaling and 128x128 RHS scaling.
LHS, RHS, RHS scaling factors, and output tensors must be in contiguous format.
RHS and RHS scaling factors are required to be transposed.
The LHS scaling tensor requires TMA-aligned transposed format, if your input does not match the requirement,
- this function will do a transposing with a set of slow PyTorch operations.
+ this function will do a transposing with a set of slow Pypaddle operations.
On the M axis, inputs are grouped into several batches, of which batch sizes aligned to
`get_m_alignment_for_contiguous_layout()` (128).

Arguments:
- lhs: the first element is an FP8 tensor (typed `torch.float8_e4m3fn`) of shape `[m_sum, k]`,
+ lhs: the first element is an FP8 tensor (typed `paddle.float8_e4m3fn`) of shape `[m_sum, k]`,
the second element is an FP32 1x128 scaling tensor for LHS of shape `[m_sum, ⌈k / 128⌉]`.
- rhs: the first element is an FP8 tensor (typed `torch.float8_e4m3fn`) of shape `[num_groups, n, k]`.
+ rhs: the first element is an FP8 tensor (typed `paddle.float8_e4m3fn`) of shape `[num_groups, n, k]`.
the second element is an FP32 128x128 scaling tensor for RHS of shape `[num_groups, ⌈n / 128⌉, ⌈k / 128⌉]`.
out: the BF16 output tensor of shape `[m_sum, n]`, representing the result.
- m_indices: a tensor of shape `[m_sum]` with type `torch.int`.
+ m_indices: a tensor of shape `[m_sum]` with type `paddle.int`.
`m_indices[i]` records the group which the i-th row of the LHS belong to,
which means that the i-th row of the LHS matrix will be multiplied with `rhs[m_indices[i]]`.
Values of `m_indices` in every-m-alignment-block must also be the same.
@@ -68,19 +68,19 @@ def m_grouped_gemm_fp8_fp8_bf16_nt_contiguous(lhs: Tuple[torch.Tensor, torch.Ten
m__ = m_indices.numel()

# Type and shape checks
- assert m == m_ == m__ and k == k_ and n == n_
- assert lhs_scales.shape == (m, (k + 127) // 128)
- assert rhs_scales.shape == (num_groups, (n + 127) // 128, (k + 127) // 128)
- assert lhs.dtype == torch.float8_e4m3fn and lhs_scales.dtype == torch.float32
- assert rhs.dtype == torch.float8_e4m3fn and rhs_scales.dtype == torch.float32
- assert out.dtype == torch.bfloat16
- assert m_indices.dtype == torch.int32
- assert lhs.is_contiguous() and rhs.is_contiguous()
- assert out.is_contiguous() and m_indices.is_contiguous()
+ # assert m == m_ == m__ and k == k_ and n == n_
+ # assert lhs_scales.shape == (m, (k + 127) // 128)
+ # assert rhs_scales.shape == (num_groups, (n + 127) // 128, (k + 127) // 128)
+ # assert lhs.dtype == paddle.float8_e4m3fn and lhs_scales.dtype == paddle.float32
+ # assert rhs.dtype == paddle.float8_e4m3fn and rhs_scales.dtype == paddle.float32
+ # assert out.dtype == paddle.bfloat16
+ # assert m_indices.dtype == paddle.int32
+ # assert lhs.is_contiguous() and rhs.is_contiguous()
+ # assert out.is_contiguous() and m_indices.is_contiguous()

# LHS scales must be transposed for TMA load, but not for RHS scales
lhs_scales = get_col_major_tma_aligned_tensor(lhs_scales)
- assert rhs_scales.is_contiguous()
+ # assert rhs_scales.is_contiguous()

# Do nothing if `m` is zero
if m == 0:
@@ -92,7 +92,7 @@ def m_grouped_gemm_fp8_fp8_bf16_nt_contiguous(lhs: Tuple[torch.Tensor, torch.Ten
num_sms, block_m, block_n, num_stages, tma_multicast_config, smem_config = get_best_configs(m, n, k, 1, num_sms, is_grouped_contiguous=True)
args = (lhs, lhs_scales, rhs, rhs_scales, out,
m_indices, m, num_groups,
- torch.cuda.current_stream(), num_sms, smem_config[0])
+ paddle.device.cuda.current_stream(), num_sms, smem_config[0])
runtime = jit_tuner.compile_and_tune(
name='m_grouped_gemm_fp8_fp8_bf16_nt',
keys={'N': n, 'K': k, 'BLOCK_M': block_m, 'BLOCK_N': block_n,
@@ -105,11 +105,11 @@ def m_grouped_gemm_fp8_fp8_bf16_nt_contiguous(lhs: Tuple[torch.Tensor, torch.Ten
'GEMM_TYPE': 'GroupedContiguous'},
space=(),
includes=includes,
- arg_defs=(('lhs', torch.float8_e4m3fn), ('lhs_scales', torch.float),
- ('rhs', torch.float8_e4m3fn), ('rhs_scales', torch.float),
- ('out', torch.bfloat16),
- ('grouped_layout', torch.int32), ('m', int), ('num_groups', int),
- ('stream', torch.cuda.Stream), ('num_sms', int), ('smem_size', int)),
+ arg_defs=(('lhs', paddle.float8_e4m3fn), ('lhs_scales', paddle.float32),
+ ('rhs', paddle.float8_e4m3fn), ('rhs_scales', paddle.float32),
+ ('out', paddle.bfloat16),
+ ('grouped_layout', paddle.int32), ('m', int), ('num_groups', int),
+ ('stream', paddle.device.cuda.Stream), ('num_sms', int), ('smem_size', int)),
template=template,
args=args
)
@@ -118,22 +118,22 @@ def m_grouped_gemm_fp8_fp8_bf16_nt_contiguous(lhs: Tuple[torch.Tensor, torch.Ten
runtime(*args)


-def m_grouped_gemm_fp8_fp8_bf16_nt_masked(lhs: Tuple[torch.Tensor, torch.Tensor],
- rhs: Tuple[torch.Tensor, torch.Tensor],
- out: torch.Tensor, masked_m: torch.Tensor, expected_m: int) -> None:
+def m_grouped_gemm_fp8_fp8_bf16_nt_masked(lhs: Tuple[paddle.Tensor, paddle.Tensor],
+ rhs: Tuple[paddle.Tensor, paddle.Tensor],
+ out: paddle.Tensor, masked_m: paddle.Tensor, expected_m: int) -> None:
"""
Do a grouped GEMM (masked format) with FP8 inputs and BF16 output, with 1x128 LHS scaling and 128x128 RHS scaling.
LHS, RHS, RHS scaling factors, and output tensors must be in contiguous format.
RHS and RHS scaling factors are required to be transposed.
The LHS scaling tensor requires TMA-aligned transposed format, if your input does not match the requirement,
- this function will do a transposing with a set of slow PyTorch operations.
+ this function will do a transposing with a set of slow paddle operations.
Moreover, this alignment requirement is different with the contiguous-format kernel, as we require that each batch
should be separately transposed.

Arguments:
- lhs: the first element is an FP8 tensor (typed `torch.float8_e4m3fn`) of shape `[num_groups, m_max, k]`,
+ lhs: the first element is an FP8 tensor (typed `paddle.float8_e4m3fn`) of shape `[num_groups, m_max, k]`,
the second element is an FP32 1x128 scaling tensor for LHS of shape `[num_groups, m_max, ⌈k / 128⌉]`.
- rhs: the first element is an FP8 tensor (typed `torch.float8_e4m3fn`) of shape `[num_groups, n, k]`.
+ rhs: the first element is an FP8 tensor (typed `paddle.float8_e4m3fn`) of shape `[num_groups, n, k]`.
the second element is an FP32 128x128 scaling tensor for RHS of shape `[num_groups, ⌈n / 128⌉, ⌈k / 128⌉]`.
out: the BF16 output tensor of shape `[num_groups, m_max, n]`, representing the result.
masked_m: a tensor of shape `[num_groups]`, `masked_m[i]` records actual rows of the `lhs[i]` matrix to compute
@@ -149,21 +149,21 @@ def m_grouped_gemm_fp8_fp8_bf16_nt_masked(lhs: Tuple[torch.Tensor, torch.Tensor]
num_groups___ = masked_m.numel()

# Type and shape checks
- assert num_groups == num_groups_ == num_groups__ == num_groups___
- assert m == m_ and n == n_ and k == k_
- assert expected_m > 0 and m > 0 and n > 0 and k > 0 and num_groups > 0
- assert lhs_scales.shape == (num_groups, m, (k + 127) // 128)
- assert rhs_scales.shape == (num_groups, (n + 127) // 128, (k + 127) // 128)
- assert lhs.dtype == torch.float8_e4m3fn and lhs_scales.dtype == torch.float32
- assert rhs.dtype == torch.float8_e4m3fn and rhs_scales.dtype == torch.float32
- assert out.dtype == torch.bfloat16
- assert masked_m.dtype == torch.int32
- assert lhs.is_contiguous() and rhs.is_contiguous()
- assert out.is_contiguous() and masked_m.is_contiguous()
+ # assert num_groups == num_groups_ == num_groups__ == num_groups___
+ # assert m == m_ and n == n_ and k == k_
+ # assert expected_m > 0 and m > 0 and n > 0 and k > 0 and num_groups > 0
+ # assert lhs_scales.shape == (num_groups, m, (k + 127) // 128)
+ # assert rhs_scales.shape == (num_groups, (n + 127) // 128, (k + 127) // 128)
+ # assert lhs.dtype == paddle.float8_e4m3fn and lhs_scales.dtype == paddle.float32
+ # assert rhs.dtype == paddle.float8_e4m3fn and rhs_scales.dtype == paddle.float32
+ # assert out.dtype == paddle.bfloat16
+ # assert masked_m.dtype == paddle.int32
+ # assert lhs.is_contiguous() and rhs.is_contiguous()
+ # assert out.is_contiguous() and masked_m.is_contiguous()

# LHS scales must be transposed for TMA load, but not for RHS scales
lhs_scales = get_col_major_tma_aligned_tensor(lhs_scales)
- assert rhs_scales.is_contiguous()
+ # assert rhs_scales.is_contiguous()

# Auto-tuning with compilation
global includes, template
@@ -176,7 +176,7 @@ def m_grouped_gemm_fp8_fp8_bf16_nt_masked(lhs: Tuple[torch.Tensor, torch.Tensor]

args = (lhs, lhs_scales, rhs, rhs_scales, out,
masked_m, m,
- torch.cuda.current_stream(), num_sms, smem_config[0])
+ paddle.device.cuda.current_stream(), num_sms, smem_config[0])
runtime = jit_tuner.compile_and_tune(
name='m_grouped_gemm_fp8_fp8_bf16_nt',
keys={'N': n, 'K': k, 'BLOCK_M': block_m, 'BLOCK_N': block_n,
@@ -189,11 +189,11 @@ def m_grouped_gemm_fp8_fp8_bf16_nt_masked(lhs: Tuple[torch.Tensor, torch.Tensor]
'GEMM_TYPE': 'GroupedMasked'},
space=(),
includes=includes,
- arg_defs=(('lhs', torch.float8_e4m3fn), ('lhs_scales', torch.float),
- ('rhs', torch.float8_e4m3fn), ('rhs_scales', torch.float),
- ('out', torch.bfloat16),
- ('grouped_layout', torch.int32), ('m', int),
- ('stream', torch.cuda.Stream), ('num_sms', int), ('smem_size', int)),
+ arg_defs=(('lhs', paddle.float8_e4m3fn), ('lhs_scales', paddle.float32),
+ ('rhs', paddle.float8_e4m3fn), ('rhs_scales', paddle.float32),
+ ('out', paddle.bfloat16),
+ ('grouped_layout', paddle.int32), ('m', int),
+ ('stream', paddle.device.cuda.Stream), ('num_sms', int), ('smem_size', int)),
template=template,
args=args
)
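As the docstring above notes, every 128-row block of the LHS must map to a single group and `m_indices` repeats that group id per row. One possible way to build such an index tensor from per-group row counts (a hypothetical helper, assuming each group's rows are already padded to the 128 alignment):

import paddle

def build_m_indices(rows_per_group, alignment=128):
    # Pad each group's row count up to the alignment, then repeat the group id
    # once per (padded) row so that every alignment block carries a single id.
    ids = []
    for group_id, rows in enumerate(rows_per_group):
        padded = ((rows + alignment - 1) // alignment) * alignment
        ids.extend([group_id] * padded)
    return paddle.to_tensor(ids, dtype='int32')

# e.g. rows_per_group=[3, 130] -> m_sum = 128 + 256 rows: 128 zeros, then 256 ones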
diff --git a/deep_gemm/jit_kernels/tuner.py b/deep_gemm/jit_kernels/tuner.py
index 6ed6749..9e1d70f 100644
--- a/deep_gemm/jit_kernels/tuner.py
+++ b/deep_gemm/jit_kernels/tuner.py
@@ -1,6 +1,6 @@
import copy
import os
-import torch
+import paddle
from typing import Any, Dict

from ..jit import build, cpp_format, generate, Runtime
@@ -51,10 +51,10 @@ class JITTuner:
continue

# Measure performance with L2 flush and a large GEMM kernel before to reduce overhead between kernels
- start_event = torch.cuda.Event(enable_timing=True)
- end_event = torch.cuda.Event(enable_timing=True)
- torch.empty(int(256e6 // 4), dtype=torch.int, device='cuda').zero_()
- torch.randn((8192, 8192), dtype=torch.float, device='cuda') @ torch.randn((8192, 8192), dtype=torch.float, device='cuda')
+ start_event = paddle.device.cuda.Event(enable_timing=True)
+ end_event = paddle.device.cuda.Event(enable_timing=True)
+ paddle.empty((int(256e6 // 4)), dtype=paddle.int32).zero_()
+ paddle.randn((8192, 8192), dtype=paddle.float32) @ paddle.randn((8192, 8192), dtype=paddle.float32)
start_event.record()
for i in range(20):
assert runtime(*args) == 0
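The tuning loop above times each candidate kernel with CUDA events after flushing L2 with a large buffer. A standalone sketch of the same timing pattern using the Paddle APIs this patch adopts (assuming a CUDA device is available):

import paddle

def time_kernel(fn, num_iters=20):
    # Flush L2 by zero-filling a 256 MB buffer, then time `fn` with CUDA events.
    paddle.empty((int(256e6 // 4),), dtype=paddle.int32).zero_()
    start = paddle.device.cuda.Event(enable_timing=True)
    end = paddle.device.cuda.Event(enable_timing=True)
    start.record()
    for _ in range(num_iters):
        fn()
    end.record()
    paddle.device.cuda.synchronize()
    return start.elapsed_time(end) / num_iters  # milliseconds per call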
diff --git a/deep_gemm/jit_kernels/utils.py b/deep_gemm/jit_kernels/utils.py
index c6da56b..a17b1b1 100644
--- a/deep_gemm/jit_kernels/utils.py
+++ b/deep_gemm/jit_kernels/utils.py
@@ -1,4 +1,4 @@
-import torch
+import paddle

_num_sms = None

@@ -11,7 +11,7 @@ def set_num_sms(num_sms: int) -> None:
num_sms: the desired maximum SM count for all GEMM kernels to use.
"""
global _num_sms
- assert 0 < num_sms <= torch.cuda.get_device_properties(device='cuda').multi_processor_count
+ assert 0 < num_sms <= paddle.device.cuda.get_device_properties().multi_processor_count
_num_sms = num_sms


@@ -25,7 +25,7 @@ def get_num_sms() -> int:
"""
global _num_sms
if _num_sms is None:
- _num_sms = torch.cuda.get_device_properties(device='cuda').multi_processor_count
+ _num_sms = paddle.device.cuda.get_device_properties().multi_processor_count
return _num_sms


@@ -74,9 +74,9 @@ def get_tma_aligned_size(x: int, element_size: int) -> int:
return ceil_div(x, alignment) * alignment


-def get_col_major_tma_aligned_tensor(x: torch.Tensor) -> torch.Tensor:
+def get_col_major_tma_aligned_tensor(x: paddle.Tensor) -> paddle.Tensor:
"""
- Returns TMA-aligned transposed format of the input tensor. `torch.transpose` will be called if necessary.
+ Returns TMA-aligned transposed format of the input tensor. `paddle.transpose` will be called if necessary.
If the input tensor is already column-major layout and 16-byte aligned along the M axis
(thus meets the requirement of LHS scaling tensor in DeepGEMM), this function will do nothing.

@@ -92,18 +92,20 @@ def get_col_major_tma_aligned_tensor(x: torch.Tensor) -> torch.Tensor:
m, n = x.shape[-2], x.shape[-1]
aligned_m = get_tma_aligned_size(m, x.element_size())
if x.dim() == 2:
- if x.stride(0) == 1 and x.stride(1) == aligned_m:
+ if x.strides[0] == 1 and x.strides[1] == aligned_m:
return x
x, remove_dim = x.unsqueeze(0), True

b = x.shape[0]

# The last kernel gives a column-major TMA aligned layout
- if x.stride(0) == aligned_m * n and x.stride(1) == 1 and x.stride(2) == aligned_m:
+ if x.strides[0] == aligned_m * n and x.strides[1] == 1 and x.strides[2] == aligned_m:
return x.squeeze(0) if remove_dim else x

# Normal layout requires transposing
- aligned_x = torch.transpose(torch.empty((b, n, aligned_m), device=x.device, dtype=x.dtype), 1, 2)
+ aligned_x = paddle.transpose(
+ paddle.empty((b, n, aligned_m), dtype=x.dtype), perm=[0, 2, 1]
+ )
aligned_x[:, :m, :] = x
aligned_x = aligned_x[:, :m, :]
return aligned_x.squeeze(0) if remove_dim else aligned_x
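The hunk above swaps `x.stride(i)` for Paddle's `x.strides[i]` and rebuilds the column-major buffer via `paddle.transpose(..., perm=[0, 2, 1])`. A small sketch of the alignment rule it relies on, i.e. the M axis rounded up to a 16-byte multiple (`tma_aligned_rows` is a hypothetical helper mirroring `get_tma_aligned_size`):

import paddle

def tma_aligned_rows(m, element_size):
    # Round the row count up so that m * element_size is a multiple of 16 bytes,
    # the alignment TMA expects along the M axis of the LHS scaling tensor.
    alignment = 16 // element_size
    return (m + alignment - 1) // alignment * alignment

scales = paddle.randn((100, 7), dtype=paddle.float32)   # FP32: element_size = 4
aligned_m = tma_aligned_rows(scales.shape[0], 4)        # 100 -> 100 (already a multiple of 4)
buf = paddle.transpose(paddle.empty((7, aligned_m), dtype=scales.dtype), perm=[1, 0])
buf[:scales.shape[0], :] = scales                       # column-major buffer, padded rows untouched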
diff --git a/deep_gemm/paddle_utils.py b/deep_gemm/paddle_utils.py
new file mode 100644
index 0000000..2326807
--- /dev/null
+++ b/deep_gemm/paddle_utils.py
@@ -0,0 +1,20 @@
+import os
+
+def get_cuda_home():
+    """Get Cuda home directory"""
+    cuda_home = os.environ.get("CUDA_HOME") or os.environ.get("CUDA_PATH")
+    if cuda_home:
+        return cuda_home
+
+    try:
+        which_cmd = "which nvcc"
+
+        nvcc_path = os.popen(which_cmd).read().strip()
+        if nvcc_path:
+            return os.path.dirname(os.path.dirname(nvcc_path))
+    except Exception:
+        pass
+
+    return None
+
+CUDA_HOME = get_cuda_home()
\ No newline at end of file
diff --git a/deep_gemm/utils.py b/deep_gemm/utils.py
index d5cdd01..5237f09 100644
--- a/deep_gemm/utils.py
+++ b/deep_gemm/utils.py
@@ -1,15 +1,15 @@
import os
import sys
import time
-import torch
-import torch.distributed as dist
+import paddle
+import paddle.distributed as dist


def bench(fn, num_warmups: int = 5, num_tests: int = 10,
high_precision: bool = False):
# Flush L2 cache with 256 MB data
- torch.cuda.synchronize()
- cache = torch.empty(int(256e6 // 4), dtype=torch.int, device='cuda')
+ paddle.device.cuda.synchronize()
+ cache = paddle.empty((int(256e6 // 4)), dtype=paddle.int32)
cache.zero_()

# Warmup
@@ -18,18 +18,18 @@ def bench(fn, num_warmups: int = 5, num_tests: int = 10,

# Add a large kernel to eliminate the CPU launch overhead
if high_precision:
- x = torch.randn((8192, 8192), dtype=torch.float, device='cuda')
- y = torch.randn((8192, 8192), dtype=torch.float, device='cuda')
+ x = paddle.randn((8192, 8192), dtype=paddle.float32)
+ y = paddle.randn((8192, 8192), dtype=paddle.float32)
x @ y

# Testing
- start_event = torch.cuda.Event(enable_timing=True)
- end_event = torch.cuda.Event(enable_timing=True)
+ start_event = paddle.device.cuda.Event(enable_timing=True)
+ end_event = paddle.device.cuda.Event(enable_timing=True)
start_event.record()
for i in range(num_tests):
fn()
end_event.record()
- torch.cuda.synchronize()
+ paddle.device.synchronize()

return start_event.elapsed_time(end_event) / num_tests

@@ -106,21 +106,21 @@ def bench_kineto(fn, kernel_names, num_tests: int = 30, suppress_kineto_output:
# Profile
suppress = suppress_stdout_stderr if suppress_kineto_output and not using_nsys else empty_suppress
with suppress():
- schedule = torch.profiler.schedule(wait=0, warmup=1, active=1, repeat=1) if not using_nsys else None
- profiler = torch.profiler.profile(activities=[torch.profiler.ProfilerActivity.CUDA], schedule=schedule) if not using_nsys else empty_suppress()
+ scheduler = paddle.profiler.make_scheduler(closed=0, ready=1, record=1, repeat=1) if not using_nsys else None
+ profiler = paddle.profiler.Profiler(targets=[paddle.profiler.ProfilerTarget.CPU, paddle.profiler.ProfilerTarget.GPU], scheduler=scheduler) if not using_nsys else empty_suppress()
with profiler:
for i in range(2):
# NOTES: use a large kernel and a barrier to eliminate the unbalanced CPU launch overhead
if barrier_comm_profiling:
- lhs = torch.randn((8192, 8192), dtype=torch.float, device='cuda')
- rhs = torch.randn((8192, 8192), dtype=torch.float, device='cuda')
+ lhs = paddle.randn((8192, 8192), dtype=paddle.float32)
+ rhs = paddle.randn((8192, 8192), dtype=paddle.float32)
lhs @ rhs
- dist.all_reduce(torch.ones(1, dtype=torch.float, device='cuda'))
+ dist.all_reduce(paddle.ones(1, dtype=paddle.float32))
for _ in range(num_tests):
if sleep_between_tests > 0.0:
time.sleep(sleep_between_tests)
if flush_l2:
- torch.empty(flush_l2_size, dtype=torch.int, device='cuda').zero_()
+ paddle.empty(flush_l2_size, dtype=paddle.int32).zero_()
fn()

if not using_nsys:
--
2.43.0
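The final hunks replace the torch.profiler schedule/profile pair with Paddle's scheduler-based profiler. A minimal standalone sketch of that profiling pattern, using the same `make_scheduler` arguments as the patch (assuming a CUDA device; `profile_once` is a hypothetical wrapper, not part of the patch):

import paddle.profiler as profiler

def profile_once(fn):
    # One warm-up step ("ready") followed by one recorded step, mirroring
    # torch.profiler.schedule(wait=0, warmup=1, active=1, repeat=1).
    sched = profiler.make_scheduler(closed=0, ready=1, record=1, repeat=1)
    prof = profiler.Profiler(
        targets=[profiler.ProfilerTarget.CPU, profiler.ProfilerTarget.GPU],
        scheduler=sched)
    with prof:
        for _ in range(2):
            fn()
            prof.step()   # advance the scheduler after every iteration
    prof.summary()        # print the collected kernel/op time table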
@@ -1,188 +0,0 @@
|
|||||||
// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
|
|
||||||
//
|
|
||||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
||||||
// you may not use this file except in compliance with the License.
|
|
||||||
// You may obtain a copy of the License at
|
|
||||||
//
|
|
||||||
// http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
//
|
|
||||||
// Unless required by applicable law or agreed to in writing, software
|
|
||||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
// See the License for the specific language governing permissions and
|
|
||||||
// limitations under the License.
|
|
||||||
#include "dtype.h"
|
|
||||||
#include "matmul_helper.h"
|
|
||||||
#include "my_types.h"
|
|
||||||
#include "paddle/extension.h"
|
|
||||||
#include "paddle/phi/core/kernel_registry.h"
|
|
||||||
template <typename T>
|
|
||||||
void AvxCompute(const paddle::Tensor &x,
|
|
||||||
const paddle::Tensor &weight,
|
|
||||||
const paddle::Tensor &w_bias,
|
|
||||||
bool trans,
|
|
||||||
const std::string alog,
|
|
||||||
paddle::Tensor &out,
|
|
||||||
xft::Matrix<T> &quantizedWeight,
|
|
||||||
xft::Vector<float> &WeightScale,
|
|
||||||
xft::Vector<float> &WeightZero,
|
|
||||||
xft::Vector<float> &WeightSum,
|
|
||||||
MMHelper *mmHelper) {
|
|
||||||
auto out_data = out.data<float>();
|
|
||||||
const float *x_data = reinterpret_cast<const float *>(x.data<float>());
|
|
||||||
const float *bias_data = nullptr;
|
|
||||||
if (w_bias.initialized()) {
|
|
||||||
bias_data = reinterpret_cast<const float *>(w_bias.data<float>());
|
|
||||||
}
|
|
||||||
int m = 1;
|
|
||||||
for (int i = 0; i < x.shape().size() - 1; i++) {
|
|
||||||
m = m * x.shape()[i];
|
|
||||||
}
|
|
||||||
int k = x.shape()[x.shape().size() - 1];
|
|
||||||
int l = weight.shape()[1];
|
|
||||||
int n = weight.shape()[1];
|
|
||||||
if (w_bias.initialized()) {
|
|
||||||
mmHelper->compute_bias(false,
|
|
||||||
m,
|
|
||||||
n,
|
|
||||||
k,
|
|
||||||
1.0f,
|
|
||||||
x_data,
|
|
||||||
k,
|
|
||||||
quantizedWeight.Data(),
|
|
||||||
WeightScale.Data(),
|
|
||||||
WeightZero.Data(),
|
|
||||||
WeightSum.Data(),
|
|
||||||
0.0f,
|
|
||||||
out_data,
|
|
||||||
l,
|
|
||||||
bias_data);
|
|
||||||
} else {
|
|
||||||
mmHelper->compute(false,
|
|
||||||
m,
|
|
||||||
n,
|
|
||||||
k,
|
|
||||||
1.0f,
|
|
||||||
x_data,
|
|
||||||
k,
|
|
||||||
quantizedWeight.Data(),
|
|
||||||
WeightScale.Data(),
|
|
||||||
WeightZero.Data(),
|
|
||||||
WeightSum.Data(),
|
|
||||||
0.0,
|
|
||||||
out_data,
|
|
||||||
l);
|
|
||||||
}
|
|
||||||
};
|
|
||||||
template <typename T>
|
|
||||||
void AvxWeightOnly(const paddle::Tensor &x,
|
|
||||||
const paddle::Tensor &weight,
|
|
||||||
const paddle::Tensor &w_bias,
|
|
||||||
bool trans,
|
|
||||||
const std::string alog,
|
|
||||||
paddle::Tensor &out) {
|
|
||||||
static std::unordered_map<std::string,
|
|
||||||
std::tuple<xft::Matrix<T> *,
|
|
||||||
xft::Vector<float> *,
|
|
||||||
xft::Vector<float> *,
|
|
||||||
xft::Vector<float> *>>
|
|
||||||
weight_only_hub;
|
|
||||||
std::stringstream weights_addr;
|
|
||||||
weights_addr << weight.data<float>() << alog;
|
|
||||||
std::string weight_only_key = weights_addr.str();
|
|
||||||
auto it_created = weight_only_hub.find(weight_only_key);
|
|
||||||
static MMHelper *mmHelper;
|
|
||||||
int rows = weight.shape()[0], cols = weight.shape()[1];
|
|
||||||
xft::Vector<float> *WeightScale =
|
|
||||||
new xft::Vector<float>(); // if weight is int8
|
|
||||||
xft::Vector<float> *WeightZero =
|
|
||||||
new xft::Vector<float>(); // if weight is int8
|
|
||||||
xft::Vector<float> *WeightSum =
|
|
||||||
new xft::Vector<float>(); // if weight is int8
|
|
||||||
xft::Matrix<T> *quantizedWeight = new xft::Matrix<T>();
|
|
||||||
if (it_created == weight_only_hub.end()) {
|
|
||||||
auto weight_ptr = reinterpret_cast<const float *>(weight.data<float>());
|
|
||||||
xft::Matrix<T> convertedWeight;
|
|
||||||
mmHelper = new MMHelper(xft::DeviceKind::iCPU, 0);
|
|
||||||
mmHelper->convertWeight(trans,
|
|
||||||
rows,
|
|
||||||
cols,
|
|
||||||
weight_ptr,
|
|
||||||
nullptr,
|
|
||||||
nullptr,
|
|
||||||
convertedWeight,
|
|
||||||
*WeightScale,
|
|
||||||
*WeightZero,
|
|
||||||
*WeightSum);
|
|
||||||
quantizedWeight->Resize(rows, cols);
|
|
||||||
mmHelper->packWeight(trans, convertedWeight, *quantizedWeight);
|
|
||||||
weight_only_hub[weight_only_key] = std::make_tuple(
|
|
||||||
quantizedWeight, WeightScale, WeightZero, WeightSum);
|
|
||||||
AvxCompute<T>(x,
|
|
||||||
weight,
|
|
||||||
w_bias,
|
|
||||||
trans,
|
|
||||||
alog,
|
|
||||||
out,
|
|
||||||
*quantizedWeight,
|
|
||||||
*WeightScale,
|
|
||||||
*WeightZero,
|
|
||||||
*WeightSum,
|
|
||||||
mmHelper);
|
|
||||||
} else {
|
|
||||||
AvxCompute<T>(x,
|
|
||||||
weight,
|
|
||||||
w_bias,
|
|
||||||
trans,
|
|
||||||
alog,
|
|
||||||
out,
|
|
||||||
*(std::get<0>(it_created->second)),
|
|
||||||
*(std::get<1>(it_created->second)),
|
|
||||||
*(std::get<2>(it_created->second)),
|
|
||||||
*(std::get<3>(it_created->second)),
|
|
||||||
mmHelper);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
std::vector<paddle::Tensor> InvokeAvxWeightOnly(const paddle::Tensor &x,
|
|
||||||
const paddle::Tensor &weight,
|
|
||||||
const paddle::Tensor &w_bias,
|
|
||||||
const std::string &alog,
|
|
||||||
bool trans) {
|
|
||||||
auto out_shape = x.shape();
|
|
||||||
out_shape[out_shape.size() - 1] = weight.shape()[1];
|
|
||||||
auto out = paddle::empty(out_shape, x.dtype(), paddle::CPUPlace());
|
|
||||||
if (alog == "int8") {
|
|
||||||
AvxWeightOnly<int8_t>(x, weight, w_bias, trans, alog, out);
|
|
||||||
} else if (alog == "fp16") {
|
|
||||||
AvxWeightOnly<float16_t>(x, weight, w_bias, trans, alog, out);
|
|
||||||
} else {
|
|
||||||
AvxWeightOnly<float16_t>(x, weight, w_bias, trans, alog, out);
|
|
||||||
}
|
|
||||||
return {out};
|
|
||||||
}
|
|
||||||
|
|
||||||
std::vector<std::vector<int64_t>> AvxWeightOnlyInferShape(
|
|
||||||
std::vector<int64_t> x_shape,
|
|
||||||
std::vector<int64_t> weigh_shape,
|
|
||||||
std::vector<int64_t> weigh_bias_shape) {
|
|
||||||
int m = 1;
|
|
||||||
for (int i = 0; i < x_shape.size() - 1; i++) {
|
|
||||||
m = m * x_shape[i];
|
|
||||||
}
|
|
||||||
return {std::vector<int64_t>{m, weigh_shape[1]}};
|
|
||||||
}
|
|
||||||
|
|
||||||
std::vector<paddle::DataType> AvxWeightOnlyInferDtype(
|
|
||||||
paddle::DataType x_dtype,
|
|
||||||
paddle::DataType weight_dtype,
|
|
||||||
paddle::DataType weight_bias_dtype) {
|
|
||||||
return {x_dtype};
|
|
||||||
}
|
|
||||||
|
|
||||||
PD_BUILD_STATIC_OP(avx_weight_only)
|
|
||||||
.Inputs({"x", "weight", "w_bias"})
|
|
||||||
.Outputs({"out"})
|
|
||||||
.Attrs({"alog: std::string", "trans:bool"})
|
|
||||||
.SetKernelFn(PD_KERNEL(InvokeAvxWeightOnly))
|
|
||||||
.SetInferShapeFn(PD_INFER_SHAPE(AvxWeightOnlyInferShape))
|
|
||||||
.SetInferDtypeFn(PD_INFER_DTYPE(AvxWeightOnlyInferDtype));
|
|
268
custom_ops/cpu_ops/rebuild_padding.cc
Normal file
@@ -0,0 +1,268 @@
|
|||||||
|
// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
//
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
// you may not use this file except in compliance with the License.
|
||||||
|
// You may obtain a copy of the License at
|
||||||
|
//
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
//
|
||||||
|
// Unless required by applicable law or agreed to in writing, software
|
||||||
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
// See the License for the specific language governing permissions and
|
||||||
|
// limitations under the License.
|
||||||
|
|
||||||
|
#include <vector>
|
||||||
|
#include "paddle/extension.h"
|
||||||
|
|
||||||
|
#ifndef PD_BUILD_STATIC_OP
|
||||||
|
#define PD_BUILD_STATIC_OP(name) PD_BUILD_OP(static_op_##name)
|
||||||
|
#endif
|
||||||
|
|
||||||
|
template <typename T>
|
||||||
|
void RebuildPaddingCPUImpl(T *output_data,
|
||||||
|
const T *input_data,
|
||||||
|
const int *cum_offsets_data,
|
||||||
|
const int *seq_len_this_time_data,
|
||||||
|
const int *seq_lens_decoder_data,
|
||||||
|
const int *seq_lens_encoder_data,
|
||||||
|
int max_input_length,
|
||||||
|
int dim_embed,
|
||||||
|
const int elem_nums) {
|
||||||
|
for (int i = 0; i < elem_nums; ++i) {
|
||||||
|
const int bi = i / dim_embed;
|
||||||
|
const int bias_idx = i % dim_embed;
|
||||||
|
int seq_id = 0;
|
||||||
|
|
||||||
|
if (seq_len_this_time_data[bi] == 0) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
if (seq_lens_decoder_data[bi] == 0 && seq_lens_encoder_data[bi] == 0) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
if (seq_lens_encoder_data[bi] > 0) {
|
||||||
|
seq_id = seq_lens_encoder_data[bi] - 1;
|
||||||
|
}
|
||||||
|
const int ori_token_idx =
|
||||||
|
bi * max_input_length - cum_offsets_data[bi] + seq_id;
|
||||||
|
const int src_offset = ori_token_idx * dim_embed + bias_idx;
|
||||||
|
|
||||||
|
output_data[i] = input_data[src_offset];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename T>
|
||||||
|
void RebuildAppendPaddingCPUImpl(T *output_data,
|
||||||
|
const T *input_data,
|
||||||
|
const int *cum_offsets_data,
|
||||||
|
const int *seq_len_this_time_data,
|
||||||
|
const int *seq_lens_decoder_data,
|
||||||
|
const int *seq_lens_encoder_data,
|
||||||
|
const int *output_padding_offset_data,
|
||||||
|
const int max_input_length,
|
||||||
|
const int dim_embed,
|
||||||
|
const int64_t output_elem_nums) {
|
||||||
|
for (int i = 0; i < output_elem_nums; ++i) {
|
||||||
|
int out_token_id = i / dim_embed;
|
||||||
|
int ori_token_id =
|
||||||
|
out_token_id + output_padding_offset_data[out_token_id];
|
||||||
|
int bi = ori_token_id / max_input_length;
|
||||||
|
if (seq_len_this_time_data[bi] == 0 ||
|
||||||
|
(seq_lens_decoder_data[bi] == 0 &&
|
||||||
|
seq_lens_encoder_data[bi] == 0)) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
int seq_id = 0;
|
||||||
|
if (seq_lens_encoder_data[bi] > 0) {
|
||||||
|
seq_id = seq_lens_encoder_data[bi] - 1;
|
||||||
|
}
|
||||||
|
int input_token_id = ori_token_id - cum_offsets_data[bi] + seq_id;
|
||||||
|
int bias_idx = i % dim_embed;
|
||||||
|
int src_offset = input_token_id * dim_embed + bias_idx;
|
||||||
|
output_data[i] = input_data[src_offset];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
std::vector<paddle::Tensor> RebuildPaddingCPU(
|
||||||
|
const paddle::Tensor &tmp_out,
|
||||||
|
const paddle::Tensor &cum_offsets,
|
||||||
|
const paddle::Tensor &seq_len_this_time,
|
||||||
|
const paddle::Tensor &seq_lens_decoder,
|
||||||
|
const paddle::Tensor &seq_lens_encoder,
|
||||||
|
const paddle::optional<paddle::Tensor> &output_padding_offset,
|
||||||
|
int max_input_length) {
|
||||||
|
auto tmp_out_cpu = tmp_out.copy_to(paddle::CPUPlace(), true);
|
||||||
|
auto cum_offsets_cpu = cum_offsets.copy_to(paddle::CPUPlace(), true);
|
||||||
|
auto seq_len_this_time_cpu =
|
||||||
|
seq_len_this_time.copy_to(paddle::CPUPlace(), true);
|
||||||
|
auto seq_lens_decoder_cpu =
|
||||||
|
seq_lens_decoder.copy_to(paddle::CPUPlace(), true);
|
||||||
|
auto seq_lens_encoder_cpu =
|
||||||
|
seq_lens_encoder.copy_to(paddle::CPUPlace(), true);
|
||||||
|
paddle::optional<paddle::Tensor> output_padding_offset_cpu;
|
||||||
|
if (output_padding_offset) {
|
||||||
|
output_padding_offset_cpu =
|
||||||
|
output_padding_offset->copy_to(paddle::CPUPlace(), true);
|
||||||
|
}
|
||||||
|
|
||||||
|
int token_num = tmp_out_cpu.shape()[0];
|
||||||
|
int dim_embed = tmp_out_cpu.shape()[1];
|
||||||
|
int bsz = cum_offsets_cpu.shape()[0];
|
||||||
|
|
||||||
|
paddle::Tensor out;
|
||||||
|
if (output_padding_offset_cpu) {
|
||||||
|
int need_delete_token_num = 0;
|
||||||
|
for (int i = 0; i < bsz; ++i) {
|
||||||
|
if (seq_lens_encoder_cpu.data<int>()[i] > 0) {
|
||||||
|
need_delete_token_num +=
|
||||||
|
seq_lens_encoder_cpu.data<int>()[i] - 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
int output_token_num = token_num - need_delete_token_num;
|
||||||
|
out = paddle::full({output_token_num, dim_embed},
|
||||||
|
0,
|
||||||
|
tmp_out_cpu.dtype(),
|
||||||
|
paddle::CPUPlace());
|
||||||
|
} else {
|
||||||
|
out = paddle::full(
|
||||||
|
{bsz, dim_embed}, 0, tmp_out_cpu.dtype(), paddle::CPUPlace());
|
||||||
|
}
|
||||||
|
|
||||||
|
const int *cum_offsets_data = cum_offsets_cpu.data<int>();
|
||||||
|
const int *seq_len_this_time_data = seq_len_this_time_cpu.data<int>();
|
||||||
|
const int *seq_lens_decoder_data = seq_lens_decoder_cpu.data<int>();
|
||||||
|
const int *seq_lens_encoder_data = seq_lens_encoder_cpu.data<int>();
|
||||||
|
int elem_nums = out.numel();
|
||||||
|
|
||||||
|
if (output_padding_offset_cpu) {
|
||||||
|
const int *output_padding_offset_data =
|
||||||
|
output_padding_offset_cpu->data<int>();
|
||||||
|
switch (tmp_out_cpu.dtype()) {
|
||||||
|
case paddle::DataType::FLOAT32:
|
||||||
|
RebuildAppendPaddingCPUImpl<float>(out.data<float>(),
|
||||||
|
tmp_out_cpu.data<float>(),
|
||||||
|
cum_offsets_data,
|
||||||
|
seq_len_this_time_data,
|
||||||
|
seq_lens_decoder_data,
|
||||||
|
seq_lens_encoder_data,
|
||||||
|
output_padding_offset_data,
|
||||||
|
max_input_length,
|
||||||
|
dim_embed,
|
||||||
|
elem_nums);
|
||||||
|
break;
|
||||||
|
case paddle::DataType::FLOAT16:
|
||||||
|
RebuildAppendPaddingCPUImpl<paddle::float16>(
|
||||||
|
out.data<paddle::float16>(),
|
||||||
|
tmp_out_cpu.data<paddle::float16>(),
|
||||||
|
cum_offsets_data,
|
||||||
|
seq_len_this_time_data,
|
||||||
|
seq_lens_decoder_data,
|
||||||
|
seq_lens_encoder_data,
|
||||||
|
output_padding_offset_data,
|
||||||
|
max_input_length,
|
||||||
|
dim_embed,
|
||||||
|
elem_nums);
|
||||||
|
break;
|
||||||
|
case paddle::DataType::BFLOAT16:
|
||||||
|
RebuildAppendPaddingCPUImpl<paddle::bfloat16>(
|
||||||
|
out.data<paddle::bfloat16>(),
|
||||||
|
tmp_out_cpu.data<paddle::bfloat16>(),
|
||||||
|
cum_offsets_data,
|
||||||
|
seq_len_this_time_data,
|
||||||
|
seq_lens_decoder_data,
|
||||||
|
seq_lens_encoder_data,
|
||||||
|
output_padding_offset_data,
|
||||||
|
max_input_length,
|
||||||
|
dim_embed,
|
||||||
|
elem_nums);
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
PD_THROW(
|
||||||
|
"Unsupported data type for rebuild_padding_cpu. "
|
||||||
|
"Only float32, float16, and bfloat16 are supported.");
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
switch (tmp_out_cpu.dtype()) {
|
||||||
|
case paddle::DataType::FLOAT32:
|
||||||
|
RebuildPaddingCPUImpl<float>(out.data<float>(),
|
||||||
|
tmp_out_cpu.data<float>(),
|
||||||
|
cum_offsets_data,
|
||||||
|
seq_len_this_time_data,
|
||||||
|
seq_lens_decoder_data,
|
||||||
|
seq_lens_encoder_data,
|
||||||
|
max_input_length,
|
||||||
|
dim_embed,
|
||||||
|
elem_nums);
|
||||||
|
break;
|
||||||
|
case paddle::DataType::FLOAT16:
|
||||||
|
RebuildPaddingCPUImpl<paddle::float16>(
|
||||||
|
out.data<paddle::float16>(),
|
||||||
|
tmp_out_cpu.data<paddle::float16>(),
|
||||||
|
cum_offsets_data,
|
||||||
|
seq_len_this_time_data,
|
||||||
|
seq_lens_decoder_data,
|
||||||
|
seq_lens_encoder_data,
|
||||||
|
max_input_length,
|
||||||
|
dim_embed,
|
||||||
|
elem_nums);
|
||||||
|
break;
|
||||||
|
case paddle::DataType::BFLOAT16:
|
||||||
|
|
||||||
|
RebuildPaddingCPUImpl<paddle::bfloat16>(
|
||||||
|
out.data<paddle::bfloat16>(),
|
||||||
|
tmp_out_cpu.data<paddle::bfloat16>(),
|
||||||
|
cum_offsets_data,
|
||||||
|
seq_len_this_time_data,
|
||||||
|
seq_lens_decoder_data,
|
||||||
|
seq_lens_encoder_data,
|
||||||
|
max_input_length,
|
||||||
|
dim_embed,
|
||||||
|
elem_nums);
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
PD_THROW(
|
||||||
|
"Unsupported data type for rebuild_padding_cpu. "
|
||||||
|
"Only float32, float16, and bfloat16 are supported.");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return {out};
|
||||||
|
}
|
||||||
|
|
||||||
|
std::vector<std::vector<int64_t>> RebuildPaddingInferShape(
|
||||||
|
const std::vector<int64_t> &tmp_out_shape,
|
||||||
|
const std::vector<int64_t> &cum_offsets_shape,
|
||||||
|
const std::vector<int64_t> &seq_len_this_time_shape,
|
||||||
|
const std::vector<int64_t> &seq_lens_decoder_shape,
|
||||||
|
const std::vector<int64_t> &seq_lens_encoder_shape,
|
||||||
|
const paddle::optional<std::vector<int64_t>> &output_padding_offset_shape) {
|
||||||
|
int64_t dim_embed = tmp_out_shape[1];
|
||||||
|
if (output_padding_offset_shape) {
|
||||||
|
return {{-1, dim_embed}};
|
||||||
|
} else {
|
||||||
|
int64_t bsz = cum_offsets_shape[0];
|
||||||
|
return {{bsz, dim_embed}};
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
std::vector<paddle::DataType> RebuildPaddingInferDtype(
|
||||||
|
const paddle::DataType &tmp_out_dtype,
|
||||||
|
const paddle::DataType &cum_offsets_dtype,
|
||||||
|
const paddle::DataType &seq_len_this_time_dtype,
|
||||||
|
const paddle::DataType &seq_lens_decoder_dtype,
|
||||||
|
const paddle::DataType &seq_lens_encoder_dtype,
|
||||||
|
const paddle::optional<paddle::DataType> &output_padding_offset_dtype) {
|
||||||
|
return {tmp_out_dtype};
|
||||||
|
}
|
||||||
|
|
||||||
|
PD_BUILD_STATIC_OP(rebuild_padding_cpu)
|
||||||
|
.Inputs({"tmp_out",
|
||||||
|
"cum_offsets",
|
||||||
|
"seq_len_this_time",
|
||||||
|
"seq_lens_decoder",
|
||||||
|
"seq_lens_encoder",
|
||||||
|
paddle::Optional("output_padding_offset")})
|
||||||
|
.Outputs({"out"})
|
||||||
|
.Attrs({"max_input_length: int"})
|
||||||
|
.SetKernelFn(PD_KERNEL(RebuildPaddingCPU))
|
||||||
|
.SetInferShapeFn(PD_INFER_SHAPE(RebuildPaddingInferShape))
|
||||||
|
.SetInferDtypeFn(PD_INFER_DTYPE(RebuildPaddingInferDtype));
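For reference, the gather performed by rebuild_padding_cpu above (keep one hidden-state row per active sequence: the last encoder token during prefill, the single decoder token otherwise) can be written as the following NumPy sketch; it is a readability aid only, not the registered op:

import numpy as np

def rebuild_padding_ref(tmp_out, cum_offsets, seq_len_this_time,
                        seq_lens_decoder, seq_lens_encoder, max_input_length):
    # tmp_out: [token_num, dim_embed] packed (unpadded) hidden states.
    bsz, dim_embed = len(cum_offsets), tmp_out.shape[1]
    out = np.zeros((bsz, dim_embed), dtype=tmp_out.dtype)
    for bi in range(bsz):
        if seq_len_this_time[bi] == 0:
            continue
        if seq_lens_decoder[bi] == 0 and seq_lens_encoder[bi] == 0:
            continue
        # Prefill step: take the last encoder token; decode step: the single token.
        seq_id = seq_lens_encoder[bi] - 1 if seq_lens_encoder[bi] > 0 else 0
        ori_token_idx = bi * max_input_length - cum_offsets[bi] + seq_id
        out[bi] = tmp_out[ori_token_idx]
    return out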
|
@@ -1,201 +0,0 @@
|
|||||||
// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
|
|
||||||
//
|
|
||||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
||||||
// you may not use this file except in compliance with the License.
|
|
||||||
// You may obtain a copy of the License at
|
|
||||||
//
|
|
||||||
// http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
//
|
|
||||||
// Unless required by applicable law or agreed to in writing, software
|
|
||||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
// See the License for the specific language governing permissions and
|
|
||||||
// limitations under the License.
|
|
||||||
|
|
||||||
#include "layers_decoder.h"
|
|
||||||
#include "paddle/extension.h"
|
|
||||||
#include "paddle/phi/core/kernel_registry.h"
|
|
||||||
|
|
||||||
std::vector<paddle::Tensor> InvokeAllLLaMALayer(
|
|
||||||
const paddle::Tensor &input,
|
|
||||||
const std::vector<paddle::Tensor> &ln1Gamma,
|
|
||||||
const std::vector<paddle::Tensor> &ln1Beta,
|
|
||||||
const std::vector<paddle::Tensor> &qkvWeight,
|
|
||||||
const std::vector<paddle::Tensor> &qkvBiasWeight,
|
|
||||||
const std::vector<paddle::Tensor> &attnOutWeight,
|
|
||||||
const std::vector<paddle::Tensor> &attnOutBias,
|
|
||||||
const std::vector<paddle::Tensor> &ln2Gamma,
|
|
||||||
const std::vector<paddle::Tensor> &ln2Beta,
|
|
||||||
const std::vector<paddle::Tensor> &gateWeight,
|
|
||||||
const std::vector<paddle::Tensor> &gateBias,
|
|
||||||
const std::vector<paddle::Tensor> &upWeight,
|
|
||||||
const std::vector<paddle::Tensor> &upBias,
|
|
||||||
const std::vector<paddle::Tensor> &downWeight,
|
|
||||||
const std::vector<paddle::Tensor> &downBias,
|
|
||||||
const paddle::Tensor &pastSeqLen,
|
|
||||||
const paddle::Tensor ¤tSeqLen,
|
|
||||||
const paddle::Tensor &step,
|
|
||||||
int hiddensize,
|
|
||||||
int totalLayer,
|
|
||||||
const std::string &computeType,
|
|
||||||
const std::string &activation,
|
|
||||||
const std::string &normType,
|
|
||||||
int attHeadDim,
|
|
||||||
int attHeadNum,
|
|
||||||
int kvHeadNum,
|
|
||||||
int maxPositions,
|
|
||||||
int maxPosEmbed,
|
|
||||||
int intermediateSize) {
|
|
||||||
auto out = paddle::empty_like(input);
|
|
||||||
auto batchSize = input.shape()[0];
|
|
||||||
auto inputSeqLen = input.shape()[1];
|
|
||||||
auto past_seq_len = pastSeqLen.data<int64_t>()[0];
|
|
||||||
auto cur_seq_len = static_cast<int64_t>(currentSeqLen.data<int32_t>()[0]);
|
|
||||||
auto step_id = step.data<int64_t>()[0];
|
|
||||||
auto output_ptr = reinterpret_cast<void *>(out.data<float>());
|
|
||||||
auto xft_data_type = xft::DataType::fp16;
|
|
||||||
if (computeType == "bf16") {
|
|
||||||
xft_data_type = xft::DataType::bf16;
|
|
||||||
} else if (computeType == "bf16_int8") {
|
|
||||||
xft_data_type = xft::DataType::bf16_int8;
|
|
||||||
}
|
|
||||||
auto xft_act_type = xft::ActivationType::SILU;
|
|
||||||
if (activation == "relu") {
|
|
||||||
xft_act_type = xft::ActivationType::RELU;
|
|
||||||
} else if (activation == "gelu") {
|
|
||||||
xft_act_type = xft::ActivationType::GELU;
|
|
||||||
} else if (activation == "swiglu") {
|
|
||||||
xft_act_type = xft::ActivationType::SWIGLU;
|
|
||||||
}
|
|
||||||
auto xft_norm_type = xft::NormType::RMS;
|
|
||||||
if (normType == "layernorm") {
|
|
||||||
xft_norm_type = xft::NormType::LN;
|
|
||||||
}
|
|
||||||
auto input_ptr = reinterpret_cast<const void *>(input.data<float>());
|
|
||||||
for (int i = 0; i < totalLayer; ++i) {
|
|
||||||
auto ln1Gamma_ptr =
|
|
||||||
reinterpret_cast<const float *>(ln1Gamma[i].data<float>());
|
|
||||||
auto ln1Beta_ptr =
|
|
||||||
reinterpret_cast<const float *>(ln1Beta[i].data<float>());
|
|
||||||
auto qkvWeight_ptr =
|
|
||||||
reinterpret_cast<const void *>(qkvWeight[i].data<float>());
|
|
||||||
auto qkvBiasWeight_ptr =
|
|
||||||
reinterpret_cast<const float *>(qkvBiasWeight[i].data<float>());
|
|
||||||
auto attnOutWeight_ptr =
|
|
||||||
reinterpret_cast<const void *>(attnOutWeight[i].data<float>());
|
|
||||||
auto ln2Gamma_ptr =
|
|
||||||
reinterpret_cast<const float *>(ln2Gamma[i].data<float>());
|
|
||||||
auto ln2Beta_ptr =
|
|
||||||
reinterpret_cast<const float *>(ln2Beta[i].data<float>());
|
|
||||||
auto gate_weight_ptr =
|
|
||||||
reinterpret_cast<const void *>(gateWeight[i].data<float>());
|
|
||||||
auto up_weight_ptr =
|
|
||||||
reinterpret_cast<const void *>(upWeight[i].data<float>());
|
|
||||||
auto down_weight_ptr =
|
|
||||||
reinterpret_cast<const void *>(downWeight[i].data<float>());
|
|
||||||
auto gate_bias_ptr =
|
|
||||||
reinterpret_cast<const float *>(gateBias[i].data<float>());
|
|
||||||
auto up_bias_ptr =
|
|
||||||
reinterpret_cast<const float *>(upBias[i].data<float>());
|
|
||||||
auto down_bias_ptr =
|
|
||||||
reinterpret_cast<const float *>(downBias[i].data<float>());
|
|
||||||
auto attnOutBias_ptr =
|
|
||||||
reinterpret_cast<const float *>(attnOutBias[i].data<float>());
|
|
||||||
invokeLayerLLaMA(
|
|
||||||
xft_data_type, // dt
|
|
||||||
xft_act_type, // at
|
|
||||||
xft_norm_type, // nt
|
|
||||||
i, // layerId
|
|
||||||
totalLayer, // totalLayers
|
|
||||||
batchSize, // batchSize
|
|
||||||
inputSeqLen, // inputSeqLen
|
|
||||||
attHeadDim, // attHeadDim
|
|
||||||
attHeadNum, // attHeadNum
|
|
||||||
kvHeadNum, // kvHeadNum
|
|
||||||
maxPositions, // maxPositions
|
|
||||||
maxPosEmbed, // maxPosEmbed
|
|
||||||
past_seq_len, // pastSeqLen
|
|
||||||
cur_seq_len, // currentSeqLen
|
|
||||||
step_id, // step
|
|
||||||
hiddensize, // hiddenSize
|
|
||||||
intermediateSize, // intermediateSize
|
|
||||||
reinterpret_cast<void *>(output_ptr), // output
|
|
||||||
hiddensize, // outputStride
|
|
||||||
input_ptr, // input
|
|
||||||
hiddensize, // inputStride
|
|
||||||
ln1Gamma_ptr, // ln1Gamma
|
|
||||||
ln1Beta_ptr, // ln1Beta
|
|
||||||
qkvWeight_ptr, // queryWeight
|
|
||||||
qkvWeight_ptr + hiddensize, // keyWeight
|
|
||||||
qkvWeight_ptr + hiddensize + kvHeadNum * attHeadDim, // valueWeight
|
|
||||||
attnOutWeight_ptr, // attnOutWeight
|
|
||||||
ln2Gamma_ptr, // ln2Gamma
|
|
||||||
ln2Beta_ptr, // ln2Beta
|
|
||||||
gate_weight_ptr,
|
|
||||||
up_weight_ptr,
|
|
||||||
down_weight_ptr,
|
|
||||||
qkvBiasWeight_ptr, // queryBias
|
|
||||||
qkvBiasWeight_ptr + hiddensize, // keyBias
|
|
||||||
qkvBiasWeight_ptr + hiddensize +
|
|
||||||
kvHeadNum * attHeadDim, // valueBias
|
|
||||||
attnOutBias_ptr, // attnOutBias
|
|
||||||
qkvWeight_ptr, // myqkvWeight
|
|
||||||
gate_bias_ptr,
|
|
||||||
up_bias_ptr,
|
|
||||||
down_bias_ptr,
|
|
||||||
qkvBiasWeight_ptr);
|
|
||||||
if (i < totalLayer - 1) {
|
|
||||||
memcpy(const_cast<void *>(input_ptr),
|
|
||||||
output_ptr,
|
|
||||||
batchSize * inputSeqLen * hiddensize * sizeof(float));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return {out};
|
|
||||||
}
|
|
||||||
|
|
||||||
std::vector<std::vector<int64_t>> AllLLaMALayerInferShape(
|
|
||||||
std::vector<int64_t> x_shape) {
|
|
||||||
return {x_shape};
|
|
||||||
}
|
|
||||||
|
|
||||||
std::vector<paddle::DataType> AllLLaMALayerInferDtype(
|
|
||||||
paddle::DataType x_dtype) {
|
|
||||||
return {x_dtype};
|
|
||||||
}
|
|
||||||
|
|
||||||
PD_BUILD_STATIC_OP(xft_llama_all_layer)
|
|
||||||
.Inputs({
|
|
||||||
"x",
|
|
||||||
paddle::Vec("ln1Gamma"),
|
|
||||||
paddle::Vec("ln1Beta"),
|
|
||||||
paddle::Vec("qkvWeight"),
|
|
||||||
paddle::Vec("qkvBiasWeight"),
|
|
||||||
paddle::Vec("attnOutWeight"),
|
|
||||||
paddle::Vec("attnOutBias"),
|
|
||||||
paddle::Vec("ln2Gamma"),
|
|
||||||
paddle::Vec("ln2Beta"),
|
|
||||||
paddle::Vec("gateWeight"),
|
|
||||||
paddle::Vec("gateBias"),
|
|
||||||
paddle::Vec("upWeight"),
|
|
||||||
paddle::Vec("upBias"),
|
|
||||||
paddle::Vec("downWeight"),
|
|
||||||
paddle::Vec("downBias"),
|
|
||||||
"pastSeqLen",
|
|
||||||
"currentSeqLen",
|
|
||||||
"step",
|
|
||||||
})
|
|
||||||
.Outputs({"out"})
|
|
||||||
.Attrs({"hiddensize :int",
|
|
||||||
"totalLayer :int",
|
|
||||||
"computeType : std::string",
|
|
||||||
"activation :std::string",
|
|
||||||
"normType :std::string",
|
|
||||||
"attHeadDim: int",
|
|
||||||
"attHeadNum: int",
|
|
||||||
"kvHeadNum: int",
|
|
||||||
"maxPositions: int",
|
|
||||||
"maxPosEmbed: int",
|
|
||||||
"intermediateSize: int"})
|
|
||||||
.SetKernelFn(PD_KERNEL(InvokeAllLLaMALayer))
|
|
||||||
.SetInferShapeFn(PD_INFER_SHAPE(AllLLaMALayerInferShape))
|
|
||||||
.SetInferDtypeFn(PD_INFER_DTYPE(AllLLaMALayerInferDtype));
|
|
@@ -1,126 +0,0 @@
|
|||||||
// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
|
|
||||||
//
|
|
||||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
||||||
// you may not use this file except in compliance with the License.
|
|
||||||
// You may obtain a copy of the License at
|
|
||||||
//
|
|
||||||
// http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
//
|
|
||||||
// Unless required by applicable law or agreed to in writing, software
|
|
||||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
// See the License for the specific language governing permissions and
|
|
||||||
// limitations under the License.
|
|
||||||
#include <omp.h>
|
|
||||||
#include <cstdio>
|
|
||||||
#include <iostream>
|
|
||||||
#include "paddle/extension.h"
|
|
||||||
|
|
||||||
void greedy_search(const float *probs,
|
|
||||||
int64_t *next_token_ids,
|
|
||||||
int bsz,
|
|
||||||
int vocab_size) {
|
|
||||||
int numThreads = 0;
|
|
||||||
#pragma omp parallel
|
|
||||||
{
|
|
||||||
int tid = omp_get_thread_num();
|
|
||||||
if (tid == 0) {
|
|
||||||
numThreads = omp_get_num_threads();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
float maxVals[bsz];
|
|
||||||
|
|
||||||
// Small batch size (each sample can have at least 2 threads)
|
|
||||||
if (numThreads / bsz >= 2) {
|
|
||||||
int thrPerSample = numThreads / bsz;
|
|
||||||
int sizePerThr = (vocab_size + thrPerSample - 1) / thrPerSample;
|
|
||||||
int maxIndices[bsz * thrPerSample];
|
|
||||||
float maxValues[bsz * thrPerSample];
|
|
||||||
|
|
||||||
// TODO: if size is small, possible to cause out of boundary
|
|
||||||
#pragma omp parallel for collapse(2)
|
|
||||||
for (int b = 0; b < bsz; ++b) {
|
|
||||||
for (int t = 0; t < thrPerSample; ++t) {
|
|
||||||
int start = t * sizePerThr;
|
|
||||||
int end = (start + sizePerThr) > vocab_size
|
|
||||||
? vocab_size
|
|
||||||
: (start + sizePerThr);
|
|
||||||
const float *p = probs + b * vocab_size;
|
|
||||||
int maxIdx = start;
|
|
||||||
float maxVal = p[start];
|
|
||||||
for (int off = start + 1; off < end; ++off) {
|
|
||||||
if (p[off] > maxVal) {
|
|
||||||
maxVal = p[off];
|
|
||||||
maxIdx = off;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// False sharing happens, but since only one time, not avoided
|
|
||||||
maxIndices[b * thrPerSample + t] = maxIdx;
|
|
||||||
maxValues[b * thrPerSample + t] = maxVal;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Local reduction
|
|
||||||
for (int i = 0; i < bsz; ++i) {
|
|
||||||
int *pIndices = maxIndices + i * thrPerSample;
|
|
||||||
float *pValues = maxValues + i * thrPerSample;
|
|
||||||
int maxIdx = pIndices[0];
|
|
||||||
float maxVal = pValues[0];
|
|
||||||
for (int j = 1; j < thrPerSample; ++j) {
|
|
||||||
if (pValues[j] > maxVal) {
|
|
||||||
maxVal = pValues[j];
|
|
||||||
maxIdx = pIndices[j];
|
|
||||||
}
|
|
||||||
}
|
|
||||||
next_token_ids[i] = maxIdx;
|
|
||||||
maxVals[i] = maxVal;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Each thread handle one sample (one row)
|
|
||||||
else {
|
|
||||||
#pragma omp parallel for
|
|
||||||
for (int i = 0; i < bsz; ++i) {
|
|
||||||
int maxId = 0;
|
|
||||||
const float *p = probs + i * vocab_size;
|
|
||||||
float maxVal = p[0];
|
|
||||||
for (int j = 1; j < vocab_size; ++j) {
|
|
||||||
if (p[j] > maxVal) {
|
|
||||||
maxVal = p[j];
|
|
||||||
maxId = j;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
next_token_ids[i] = maxId;
|
|
||||||
maxVals[i] = maxVal;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
std::vector<paddle::Tensor> XftGreedySearch(const paddle::Tensor &probs) {
|
|
||||||
const int bsz = probs.shape()[0];
|
|
||||||
const int vocab_size = probs.shape()[1];
|
|
||||||
auto next_tokens =
|
|
||||||
paddle::empty({bsz, 1}, paddle::DataType::INT64, probs.place());
|
|
||||||
|
|
||||||
greedy_search(probs.data<float>(),
|
|
||||||
const_cast<int64_t *>(next_tokens.data<int64_t>()),
|
|
||||||
bsz,
|
|
||||||
vocab_size);
|
|
||||||
return {next_tokens};
|
|
||||||
}
|
|
||||||
std::vector<std::vector<int64_t>> XftGreedySearchInferShape(
|
|
||||||
const std::vector<int64_t> &probs_shape) {
|
|
||||||
int64_t bsz = probs_shape[0];
|
|
||||||
return {{bsz, 1}};
|
|
||||||
}
|
|
||||||
std::vector<paddle::DataType> XftGreedySearchInferDtype(
|
|
||||||
const paddle::DataType &probs_dtype) {
|
|
||||||
return {paddle::DataType::INT64};
|
|
||||||
}
|
|
||||||
PD_BUILD_STATIC_OP(xft_greedy_search)
|
|
||||||
.Inputs({"probs"})
|
|
||||||
.Outputs({"next_tokens_ids"})
|
|
||||||
.SetInferShapeFn(PD_INFER_SHAPE(XftGreedySearchInferShape))
|
|
||||||
.SetInferDtypeFn(PD_INFER_DTYPE(XftGreedySearchInferDtype))
|
|
||||||
.SetKernelFn(PD_KERNEL(XftGreedySearch));
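Functionally, the OpenMP kernel above is a per-row argmax over the vocabulary; a NumPy reference for checking its output (not the registered op) could look like:

import numpy as np

def xft_greedy_search_ref(probs):
    # probs: [bsz, vocab_size] float32 -> next token ids as int64 of shape [bsz, 1]
    return np.argmax(probs, axis=1).astype(np.int64).reshape(-1, 1)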
|
|
File diff suppressed because it is too large
@@ -17,15 +17,12 @@
|
|||||||
#include "paddle/phi/core/memory/memcpy.h"
|
#include "paddle/phi/core/memory/memcpy.h"
|
||||||
|
|
||||||
template <int THREADBLOCK_SIZE>
|
template <int THREADBLOCK_SIZE>
|
||||||
__global__ void GetMaxLenKernel(const int *seq_lens,
|
__global__ void
|
||||||
const int *seq_lens_this_time,
|
GetMaxLenKernel(const int *seq_lens, const int *seq_lens_this_time,
|
||||||
const int *seq_lens_encoder,
|
const int *seq_lens_encoder,
|
||||||
const int *seq_lens_this_time_merged,
|
const int *seq_lens_this_time_merged,
|
        const int *seq_lens_encoder_merged, const int *seq_mapping,
        const int *system_lens, int *max_lens, const int batch_size) {
  const int tid = threadIdx.x;

  typedef cub::BlockReduce<int, THREADBLOCK_SIZE> BlockReduce;
@@ -41,43 +38,61 @@ __global__ void GetMaxLenKernel(const int *seq_lens,
  int max_dec_len_without_system_this_thread = 0;
  for (int i = tid; i < batch_size; i += blockDim.x) {
    const int seq_len_this_time = seq_lens_this_time[i];
    max_len_this_time_this_thread =
        max(seq_len_this_time, max_len_this_time_this_thread);
    max_len_encoder_this_thread =
        max(seq_lens_encoder[i], max_len_encoder_this_thread);
    max_len_decoder_this_thread = max(seq_lens[i], max_len_decoder_this_thread);
    if (seq_len_this_time <= 0)
      continue;
    const int max_just_dec_len_now = seq_lens_encoder[i] > 0 ? 0 : seq_lens[i];
    max_len_this_thread =
        max(seq_lens[i] + seq_len_this_time, max_len_this_thread);
    max_just_dec_len_this_thread =
        max(max_just_dec_len_this_thread, max_just_dec_len_now);
    if (system_lens) {
      const int real_bid = seq_mapping[i];
      const int system_len_now = system_lens[real_bid];
      max_system_len_this_thread =
          max(max_system_len_this_thread, system_len_now);
      max_dec_len_without_system_this_thread =
          max(max_dec_len_without_system_this_thread,
              max_just_dec_len_now - system_len_now);
    }
  }
  if (system_lens) {
    for (int i = tid; i < batch_size; i += blockDim.x) {
      const int ori_seq_len_this_time = seq_lens_this_time_merged[i];
      if (ori_seq_len_this_time <= 0)
        continue;
      const int max_just_dec_merged_len_this_time_now =
          seq_lens_encoder_merged[i] > 0 ? 0 : ori_seq_len_this_time;
      max_just_dec_merged_len_this_time_this_thread =
          max(max_just_dec_merged_len_this_time_this_thread,
              max_just_dec_merged_len_this_time_now);
    }
  }
  int total_max_len_this_time =
      BlockReduce(temp_storage)
          .Reduce(max_len_this_time_this_thread, MaxOp<int>());
  int total_max_len_encoder =
      BlockReduce(temp_storage)
          .Reduce(max_len_encoder_this_thread, MaxOp<int>());
  int total_max_len_decoder =
      BlockReduce(temp_storage)
          .Reduce(max_len_decoder_this_thread, MaxOp<int>());
  int total =
      BlockReduce(temp_storage).Reduce(max_len_this_thread, MaxOp<int>());
  int total_just_dec = BlockReduce(temp_storage)
                           .Reduce(max_just_dec_len_this_thread, MaxOp<int>());
  int total_just_dec_merged =
      BlockReduce(temp_storage)
          .Reduce(max_just_dec_merged_len_this_time_this_thread, MaxOp<int>());
  int total_system_len = BlockReduce(temp_storage)
                             .Reduce(max_system_len_this_thread, MaxOp<int>());
  int total_dec_len_without_system =
      BlockReduce(temp_storage)
          .Reduce(max_dec_len_without_system_this_thread, MaxOp<int>());
  if (tid == 0) {
    max_lens[0] = total_max_len_this_time;
    max_lens[1] = total_max_len_encoder;
@@ -90,30 +105,22 @@ __global__ void GetMaxLenKernel(const int *seq_lens,
  }
}

void GetMaxLen(const paddle::Tensor &seq_lens_tensor,
               const paddle::Tensor &seq_lens_this_time,
               const paddle::Tensor &seq_lens_encoder,
               paddle::Tensor &max_len_tensor, const int batch_size) {
  constexpr int blockSize = 1024;
  GetMaxLenKernel<blockSize><<<1, blockSize, 0, seq_lens_encoder.stream()>>>(
      seq_lens_tensor.data<int>(), seq_lens_this_time.data<int>(),
      seq_lens_encoder.data<int>(), nullptr, nullptr, nullptr, nullptr,
      max_len_tensor.data<int>(), batch_size);
}

__global__ void split_q_block(const int *__restrict__ seq_lens_q,
                              const int *__restrict__ seq_lens_encoder,
                              int *__restrict__ batch_ids,
                              int *__restrict__ tile_ids_per_batch,
                              int *__restrict__ num_blocks_x, const int bsz,
                              const int num_rows_per_block,
                              const int group_size) {
  if (threadIdx.x == 0) {
@@ -124,8 +131,7 @@ __global__ void split_q_block(const int* __restrict__ seq_lens_q,
      if (seq_lens_encoder && seq_lens_encoder[bid] > 0) {
        seq_len = 0;
      }
      const int loop_times = div_up(seq_len * group_size, num_rows_per_block);
      for (uint32_t tile_id = 0; tile_id < loop_times; tile_id++) {
        batch_ids[index] = bid;
        tile_ids_per_batch[index++] = tile_id;
@@ -136,14 +142,12 @@ __global__ void split_q_block(const int* __restrict__ seq_lens_q,
  }
}

__global__ void split_kv_block(const int *__restrict__ seq_lens_decoder,
                               const int *__restrict__ seq_lens_encoder,
                               int *__restrict__ batch_ids,
                               int *__restrict__ tile_ids_per_batch,
                               int *__restrict__ num_blocks_x, const int bsz,
                               const int pad_len, const int num_row_per_block) {
  if (threadIdx.x == 0) {
    int gridx = 0;
    int index = 0;
@@ -165,50 +169,46 @@ __global__ void split_kv_block(const int* __restrict__ seq_lens_decoder,
}

template <int THREADBLOCK_SIZE>
__global__ void
get_max_len_kv_ernel(int *max_seq_lens_out, const int *seq_lens_this_time,
                     const int *seq_lens_decoder, const int batch_size) {
  const int tid = threadIdx.x;

  typedef cub::BlockReduce<int, THREADBLOCK_SIZE> BlockReduce;
  __shared__ typename BlockReduce::TempStorage temp_storage;

  int max_len_this_thread = 0;
  for (int i = tid; i < batch_size; i += blockDim.x) {
    if (seq_lens_decoder[i] == 0)
      continue;
    max_len_this_thread =
        max(seq_lens_this_time[i] + seq_lens_decoder[i], max_len_this_thread);
  }
  int total =
      BlockReduce(temp_storage).Reduce(max_len_this_thread, MaxOp<int>());
  if (tid == 0) {
    *max_seq_lens_out = total;
  }
}

std::vector<paddle::Tensor> GetBlockShapeAndSplitKVBlock(
    const paddle::Tensor &seq_lens_encoder,
    const paddle::Tensor &seq_lens_decoder,
    const paddle::Tensor &seq_lens_this_time, const paddle::Tensor &cum_offsets,
    const int encoder_block_shape_q, const int decoder_block_shape_q,
    const int group_size, const int block_size,
    const int decoder_step_token_num) {
  auto stream = seq_lens_encoder.stream();
  int bsz = cum_offsets.shape()[0];
  auto max_len_tensor =
      GetEmptyTensor({8}, paddle::DataType::INT32, seq_lens_encoder.place());
  GetMaxLen(seq_lens_decoder, seq_lens_this_time, seq_lens_encoder,
            max_len_tensor, bsz);

  // max_len_this_time, max_enc_len_this_time, max_dec_len_this_time,
  // max_enc_dec_len_this_time, max_just_dec_len_this_time,
  // max_just_dec_merged_len_this_time, max_system_len,
  // max_just_dec_len_without_system
  auto max_len_cpu = max_len_tensor.copy_to(paddle::CPUPlace(), false);
  auto max_len_cpu_ptr = max_len_cpu.data<int>();
  int max_len_this_time = max_len_cpu_ptr[0];
@@ -229,67 +229,67 @@ std::vector<paddle::Tensor> GetBlockShapeAndSplitKVBlock(
  paddle::Tensor decoder_batch_ids;
  paddle::Tensor decoder_tile_ids_per_batch;
  paddle::Tensor decoder_num_blocks_x_cpu; /*cpu*/
  paddle::Tensor max_len_kv_cpu;           /*cpu*/

  auto max_len_kv =
      GetEmptyTensor({1}, paddle::DataType::INT32, seq_lens_decoder.place());
  get_max_len_kv_ernel<128><<<1, 128, 0, stream>>>(
      max_len_kv.data<int>(), seq_lens_this_time.data<int>(),
      seq_lens_decoder.data<int>(), bsz);

  max_len_kv_cpu = max_len_kv.copy_to(paddle::CPUPlace(), false);

  if (max_enc_len_this_time > 0) {
    const uint32_t max_tile_size_per_bs_kv =
        div_up(max_enc_dec_len_this_time, block_size);
    kv_batch_ids =
        GetEmptyTensor({bsz * max_tile_size_per_bs_kv}, paddle::DataType::INT32,
                       seq_lens_encoder.place());
    kv_tile_ids_per_batch =
        GetEmptyTensor({bsz * max_tile_size_per_bs_kv}, paddle::DataType::INT32,
                       seq_lens_encoder.place());
    auto kv_num_blocks_x =
        GetEmptyTensor({1}, paddle::DataType::INT32, seq_lens_encoder.place());

    split_kv_block<<<1, 32, 0, seq_lens_encoder.stream()>>>(
        seq_lens_decoder.data<int>(),
        // sequence_lengths->data<int>(),
        seq_lens_encoder.data<int>(), kv_batch_ids.data<int>(),
        kv_tile_ids_per_batch.data<int>(), kv_num_blocks_x.data<int>(), bsz,
        block_size, block_size);

    kv_num_blocks_x_cpu = kv_num_blocks_x.copy_to(paddle::CPUPlace(), false);

    const uint32_t encoder_max_tile_size_per_bs_q =
        div_up((max_enc_dec_len_this_time * group_size), encoder_block_shape_q);
    encoder_batch_ids =
        GetEmptyTensor({bsz * encoder_max_tile_size_per_bs_q},
                       paddle::DataType::INT32, seq_lens_encoder.place());
    encoder_tile_ids_per_batch =
        GetEmptyTensor({bsz * encoder_max_tile_size_per_bs_q},
                       paddle::DataType::INT32, seq_lens_encoder.place());
    auto encoder_num_blocks_x =
        GetEmptyTensor({1}, paddle::DataType::INT32, seq_lens_encoder.place());
    split_q_block<<<1, 32, 0, stream>>>(seq_lens_encoder.data<int>(), nullptr,
                                        encoder_batch_ids.data<int>(),
                                        encoder_tile_ids_per_batch.data<int>(),
                                        encoder_num_blocks_x.data<int>(), bsz,
                                        encoder_block_shape_q, group_size);
    encoder_num_blocks_x_cpu =
        encoder_num_blocks_x.copy_to(paddle::CPUPlace(), false);
  } else {
    encoder_batch_ids =
        GetEmptyTensor({0}, paddle::DataType::INT32, seq_lens_encoder.place());
    encoder_tile_ids_per_batch =
        GetEmptyTensor({0}, paddle::DataType::INT32, seq_lens_encoder.place());
    encoder_num_blocks_x_cpu =
        GetEmptyTensor({0}, paddle::DataType::INT32, paddle::CPUPlace());
    kv_batch_ids =
        GetEmptyTensor({0}, paddle::DataType::INT32, seq_lens_encoder.place());
    kv_tile_ids_per_batch =
        GetEmptyTensor({0}, paddle::DataType::INT32, seq_lens_encoder.place());
    kv_num_blocks_x_cpu =
        GetEmptyTensor({0}, paddle::DataType::INT32, seq_lens_encoder.place());
  }
  if (max_just_dec_len_this_time > 0) {
    const uint32_t decoder_max_tile_size_per_bs_q =
@@ -297,24 +297,26 @@ std::vector<paddle::Tensor> GetBlockShapeAndSplitKVBlock(
    decoder_batch_ids =
        GetEmptyTensor({bsz * decoder_max_tile_size_per_bs_q},
                       paddle::DataType::INT32, seq_lens_encoder.place());
    decoder_tile_ids_per_batch =
        GetEmptyTensor({bsz * decoder_max_tile_size_per_bs_q},
                       paddle::DataType::INT32, seq_lens_encoder.place());
    auto decoder_num_blocks_x =
        GetEmptyTensor({1}, paddle::DataType::INT32, seq_lens_encoder.place());
    split_q_block<<<1, 32, 0, stream>>>(
        seq_lens_this_time.data<int>(), seq_lens_encoder.data<int>(),
        decoder_batch_ids.data<int>(), decoder_tile_ids_per_batch.data<int>(),
        decoder_num_blocks_x.data<int>(), bsz, decoder_block_shape_q,
        group_size);
    decoder_num_blocks_x_cpu =
        decoder_num_blocks_x.copy_to(paddle::CPUPlace(), false);
  } else {
    decoder_batch_ids =
        GetEmptyTensor({0}, paddle::DataType::INT32, seq_lens_encoder.place());
    decoder_tile_ids_per_batch =
        GetEmptyTensor({0}, paddle::DataType::INT32, seq_lens_encoder.place());
    decoder_num_blocks_x_cpu =
        GetEmptyTensor({0}, paddle::DataType::INT32, paddle::CPUPlace());
  }

  return {encoder_batch_ids,
@@ -331,28 +333,22 @@ std::vector<paddle::Tensor> GetBlockShapeAndSplitKVBlock(
}

std::vector<paddle::DataType> GetBlockShapeAndSplitKVBlockInferDtype(
    const paddle::DataType &seq_lens_encoder_dtype,
    const paddle::DataType &seq_lens_decoder_dtype,
    const paddle::DataType &seq_lens_this_time_dtype,
    const paddle::DataType &cum_offsets_dtype) {
  return {
      paddle::DataType::INT32, paddle::DataType::INT32, paddle::DataType::INT32,
      paddle::DataType::INT32, paddle::DataType::INT32, paddle::DataType::INT32,
      paddle::DataType::INT32, paddle::DataType::INT32, paddle::DataType::INT32,
      paddle::DataType::INT32, paddle::DataType::INT32};
}

std::vector<std::vector<int64_t>> GetBlockShapeAndSplitKVBlockInferShape(
    const std::vector<int64_t> &seq_lens_encoder_shape,
    const std::vector<int64_t> &seq_lens_decoder_shape,
    const std::vector<int64_t> &seq_lens_this_time_shape,
    const std::vector<int64_t> &cum_offsets_shape) {
  std::vector<int64_t> dynamic_shape = {-1};

  return {dynamic_shape,
@@ -369,9 +365,7 @@ std::vector<std::vector<int64_t>> GetBlockShapeAndSplitKVBlockInferShape(
}

PD_BUILD_STATIC_OP(get_block_shape_and_split_kv_block)
    .Inputs({"seq_lens_encoder", "seq_lens_decoder", "seq_lens_this_time",
             "cum_offsets"})
    .Outputs({paddle::Optional("encoder_batch_ids"),
              paddle::Optional("encoder_tile_ids_per_batch"),
@@ -382,12 +376,9 @@ PD_BUILD_STATIC_OP(get_block_shape_and_split_kv_block)
              paddle::Optional("decoder_batch_ids"),
              paddle::Optional("decoder_tile_ids_per_batch"),
              paddle::Optional("decoder_num_blocks"),
              paddle::Optional("max_len_kv"), "set_max_lengths"})
    .Attrs({"encoder_block_shape_q: int", "decoder_block_shape_q: int",
            "group_size: int", "block_size: int",
            "decoder_step_token_num: int"})
    .SetKernelFn(PD_KERNEL(GetBlockShapeAndSplitKVBlock))
    .SetInferShapeFn(PD_INFER_SHAPE(GetBlockShapeAndSplitKVBlockInferShape))
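The tile bookkeeping above reduces to a small piece of integer arithmetic. The standalone host-side sketch below is not part of this commit and uses made-up example values; it only reproduces the div_up-based tile counts that split_q_block and split_kv_block write out per batch entry.

// Standalone sketch of the tile-count arithmetic used above; the concrete
// numbers are hypothetical examples, not FastDeploy defaults.
#include <cstdio>

static int div_up(int a, int b) { return (a + b - 1) / b; }

int main() {
  const int group_size = 8;              // hypothetical GQA group size
  const int encoder_block_shape_q = 64;  // hypothetical query rows per tile
  const int block_size = 64;             // hypothetical KV-cache block size
  const int seq_len = 100;               // hypothetical prompt length

  // Q-side tiling: split_q_block emits one (batch_id, tile_id) pair per tile.
  const int q_tiles = div_up(seq_len * group_size, encoder_block_shape_q);  // 13
  // KV-side tiling: split_kv_block emits one tile per KV block of the sequence.
  const int kv_tiles = div_up(seq_len, block_size);  // 2
  std::printf("q_tiles=%d kv_tiles=%d\n", q_tiles, kv_tiles);
  return 0;
}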
@@ -337,6 +337,8 @@ __forceinline__ __host__ __device__ void vec_cast<nv_bfloat16, float>(
  } else if (deal_each_time == 64) {                                         \
    constexpr size_t DEAL_EACH_TIME = 64;                                    \
    __VA_ARGS__                                                              \
  } else {                                                                   \
    PD_THROW("not support the deal_each_time", deal_each_time);              \
  }

#define DISPATCH_NUM_THREADS(num_threads, NUM_THREADS, ...)                  \
@@ -346,6 +348,8 @@ __forceinline__ __host__ __device__ void vec_cast<nv_bfloat16, float>(
  } else if (num_threads == 256) {                                           \
    constexpr size_t NUM_THREADS = 256;                                      \
    __VA_ARGS__                                                              \
  } else {                                                                   \
    PD_THROW("not support the num_threads", num_threads);                    \
  }

#define DISPATCH_GQA_GROUP_SIZE(group_size, GROUP_SIZE, ...)                 \
@@ -376,6 +380,11 @@ __forceinline__ __host__ __device__ void vec_cast<nv_bfloat16, float>(
  } else if (group_size == 12) {                                             \
    constexpr size_t GROUP_SIZE = 12;                                        \
    __VA_ARGS__                                                              \
  } else if (group_size == 16) {                                             \
    constexpr size_t GROUP_SIZE = 16;                                        \
    __VA_ARGS__                                                              \
  } else {                                                                   \
    PD_THROW("not support the group_size", group_size);                      \
  }

#define DISPATCH_BLOCKSHAPE_Q(block_shape_q, BLOCK_SHAPE_Q, NUM_WARP_Q, ...) \
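These dispatch macros all follow the same pattern: branch once on a runtime value, bind it to a constexpr constant, and run the caller-supplied body, with the new else branches making unsupported values fail through PD_THROW instead of compiling to nothing. The self-contained sketch below is a simplified stand-in (the _SKETCH macro is illustrative, not the real FastDeploy header) showing how such a macro turns a runtime value into a template parameter.

// Simplified, assumed illustration of the dispatch pattern above.
#include <cstddef>
#include <cstdio>
#include <stdexcept>

#define DISPATCH_GQA_GROUP_SIZE_SKETCH(group_size, GROUP_SIZE, ...) \
  if (group_size == 8) {                                            \
    constexpr size_t GROUP_SIZE = 8;                                \
    __VA_ARGS__                                                     \
  } else if (group_size == 16) {                                    \
    constexpr size_t GROUP_SIZE = 16;                               \
    __VA_ARGS__                                                     \
  } else {                                                          \
    throw std::runtime_error("not support the group_size");         \
  }

template <size_t GROUP_SIZE>
void RunKernel() {
  std::printf("instantiated with GROUP_SIZE=%zu\n", GROUP_SIZE);
}

int main() {
  const int group_size = 16;  // runtime value, e.g. num_q_heads / num_kv_heads
  DISPATCH_GQA_GROUP_SIZE_SKETCH(group_size, GROUP_SIZE, { RunKernel<GROUP_SIZE>(); })
  return 0;
}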
@@ -13,7 +13,7 @@
// limitations under the License.

#include "paddle/extension.h"
#include "pybind11/pybind11.h"
namespace py = pybind11;

// Custom exception class for handling CUDA errors
@@ -125,45 +125,40 @@ paddle::Tensor FusedExpertMoeFunc(
    const bool norm_topk_prob, const bool group_moe);

std::vector<paddle::Tensor> MoeExpertDispatch(
    const paddle::Tensor &input, const paddle::Tensor &gating_output,
    const paddle::optional<paddle::Tensor> &gating_correction_bias,
    const paddle::optional<paddle::Tensor> &w4a8_in_scale, const int moe_topk,
    const bool group_moe, const bool topk_only_mode);

std::vector<paddle::Tensor>
MoETopKSelectKernel(const paddle::Tensor &gating_logits,
                    const paddle::optional<paddle::Tensor> &bias,
                    const int moe_topk, const bool apply_norm_weight,
                    const bool enable_softmax_top_k_fused);

std::vector<paddle::Tensor>
MoERedundantTopKSelectKernel(const paddle::Tensor &gating_logits,
                             const paddle::Tensor &expert_id_to_ep_rank_array,
                             const paddle::Tensor &expert_in_rank_num_list,
                             paddle::Tensor &tokens_per_expert_stats_list,
                             const paddle::optional<paddle::Tensor> &bias,
                             const int moe_topk, const bool apply_norm_weight,
                             const bool enable_softmax_top_k_fused,
                             const int redundant_ep_rank_num_plus_one);

std::vector<paddle::Tensor>
EPMoeExpertDispatch(const paddle::Tensor &input, const paddle::Tensor &topk_ids,
                    const paddle::Tensor &topk_weights,
                    const paddle::optional<paddle::Tensor> &ffn1_in_scale,
                    const std::vector<int> &token_nums_per_expert,
                    const int token_nums_this_rank,
                    const std::string &moe_quant_type);

std::vector<paddle::Tensor> EPMoeExpertDispatchFP8(
    const paddle::Tensor &input, const paddle::Tensor &scale,
    const paddle::Tensor &topk_ids, const paddle::Tensor &topk_weights,
    const paddle::Tensor &token_nums_per_expert,
    const paddle::Tensor &token_nums_per_expert_padded);

std::vector<paddle::Tensor> PerTokenQuant(paddle::Tensor &input,
                                          const int block_size);
@@ -180,20 +175,35 @@ std::vector<paddle::Tensor> EPMoeExpertCombine(
    const paddle::optional<paddle::Tensor> &ffn2_bias,
    const bool norm_topk_prob, const float routed_scaling_factor);

std::vector<std::vector<int>> GetExpertTokenNum(const paddle::Tensor &topk_ids,
                                                const int num_experts);

paddle::Tensor MoeExpertFFNFunc(
    const paddle::Tensor& permute_input,
    const paddle::Tensor& tokens_expert_prefix_sum,
    const paddle::Tensor& ffn1_weight, const paddle::Tensor& ffn2_weight,
    const paddle::optional<paddle::Tensor>& ffn1_bias,
    const paddle::optional<paddle::Tensor>& ffn1_scale,
    const paddle::optional<paddle::Tensor>& ffn2_scale,
    const paddle::optional<paddle::Tensor>& ffn2_in_scale,
    const paddle::optional<paddle::Tensor>& expert_idx_per_token,
    const std::string& quant_method, const bool used_in_ep_low_latency);

paddle::Tensor MoeExpertFFNWint2Func(
    const paddle::Tensor& permute_input,
    const paddle::Tensor& tokens_expert_prefix_sum,
    const paddle::Tensor& ffn1_weight,
    const paddle::Tensor& ffn2_weight,
    const paddle::optional<paddle::Tensor>& ffn1_bias,
    const paddle::optional<paddle::Tensor>& ffn1_scale,
    const paddle::optional<paddle::Tensor>& ffn2_scale,
    const paddle::optional<paddle::Tensor>& ffn1_local_scale,
    const paddle::optional<paddle::Tensor>& ffn1_code_scale,
    const paddle::optional<paddle::Tensor>& ffn1_code_zp,
    const paddle::optional<paddle::Tensor>& ffn2_local_scale,
    const paddle::optional<paddle::Tensor>& ffn2_code_scale,
    const paddle::optional<paddle::Tensor>& ffn2_code_zp,
    const bool used_in_ep_low_latency);

paddle::Tensor MoeExpertReduceFunc(
    const paddle::Tensor &ffn_out, const paddle::Tensor &top_k_weight,
@@ -205,19 +215,16 @@ paddle::Tensor MoeExpertReduceFunc(
void InitKVSignalPerQuery(const paddle::Tensor &seq_lens_encoder_tensor,
                          const paddle::Tensor &seq_lens_this_time_tensor,
                          const paddle::Tensor &seq_lens_decoder_tensor,
                          const int rank, const int num_layers);

void GetOutputKVSignal(const paddle::Tensor &x, int64_t rank_id,
                       bool wait_flag);

paddle::Tensor DequantInt8Func(const paddle::Tensor &input,
                               const paddle::Tensor &out_scale,
                               std::string dtype);

paddle::Tensor OpenShmAndGetMetaSignalFunc(const int rank, const int device_id,
                                           const bool keep_pd_step_flag);

paddle::Tensor InitSignalLayerwiseFunc(const paddle::Tensor &kv_signal_metadata,
@@ -286,61 +293,121 @@ std::vector<paddle::Tensor> ExtractTextTokenOutput(
    const paddle::Tensor &seq_lens_this_time,
    const paddle::Tensor &cu_seqlens_q, const paddle::Tensor &score_text);

std::vector<paddle::Tensor> MoEDeepGEMMPermute(const paddle::Tensor &x,
                                               const paddle::Tensor &topk_idx,
                                               const int num_experts,
                                               const int max_tokens_per_expert);

std::vector<paddle::Tensor> MoEDeepGEMMDePermute(
    const paddle::Tensor
        &ffn_out, // [num_experts, max_tokens_per_expert, hidden]
    const paddle::Tensor &permute_indices_per_token, // [token_num, topk]
    const paddle::Tensor &topk_idx, const paddle::Tensor &topk_weights);

void TextImageIndexOut(const paddle::Tensor &token_type_ids,
                       const paddle::Tensor &text_input,
                       const paddle::Tensor &image_input);

void TextImageGatherScatter(paddle::Tensor &input, paddle::Tensor &text_input,
                            paddle::Tensor &image_input,
                            paddle::Tensor &token_type_ids,
                            paddle::Tensor &text_index,
                            paddle::Tensor &image_index, const bool is_scatter);

paddle::Tensor count_tokens_per_expert_func(const paddle::Tensor &topk_ids,
                                            int64_t num_experts);

std::vector<paddle::Tensor> tritonmoe_preprocess_kernel(
    const paddle::Tensor& topk_ids, int64_t num_experts,
    int64_t GEMM_BLOCK_SIZE_M);

std::vector<paddle::Tensor> MoeWna16MarlinGemmApi(
    const paddle::Tensor& a,
    const paddle::optional<paddle::Tensor>& c_or_none,
    const paddle::Tensor& b_q_weight,
    const paddle::Tensor& b_scales,
    const paddle::optional<paddle::Tensor>& global_scale_or_none,
    const paddle::optional<paddle::Tensor>& b_zeros_or_none,
    const paddle::optional<paddle::Tensor>& g_idx_or_none,
    const paddle::optional<paddle::Tensor>& perm_or_none,
    const paddle::Tensor& workspace,
    const paddle::Tensor& sorted_token_ids,
    const paddle::Tensor& expert_ids,
    const paddle::Tensor& num_tokens_post_padded,
    const paddle::Tensor& topk_weights,
    int64_t moe_block_size,
    int64_t top_k,
    bool mul_topk_weights,
    bool is_ep,
    const std::string& b_q_type_str,
    int64_t size_m,
    int64_t size_n,
    int64_t size_k,
    bool is_k_full,
    bool use_atomic_add,
    bool use_fp32_reduce,
    bool is_zp_float);

void CutlassScaledMm(paddle::Tensor &c, paddle::Tensor const &a,
                     paddle::Tensor const &b, paddle::Tensor const &a_scales,
                     paddle::Tensor const &b_scales,
                     paddle::optional<paddle::Tensor> const &bias);

void CutlassScaledMmAzp(paddle::Tensor& c, paddle::Tensor const& a,
                        paddle::Tensor const& b,
                        paddle::Tensor const& a_scales,
                        paddle::Tensor const& b_scales,
                        paddle::Tensor const& azp_adj,
                        paddle::optional<paddle::Tensor> const& azp,
                        paddle::optional<paddle::Tensor> const& bias);

void StaticScaledFp8Quant(paddle::Tensor &out, paddle::Tensor const &input,
                          paddle::Tensor const &scale);

void DynamicScaledFp8Quant(paddle::Tensor &out, paddle::Tensor const &input,
                           paddle::Tensor &scale);

void DynamicPerTokenScaledFp8Quant(paddle::Tensor &out,
                                   paddle::Tensor const &input,
                                   paddle::Tensor &scales, float scale_ub);

PYBIND11_MODULE(fastdeploy_ops, m) {

  m.def("get_expert_token_num", &GetExpertTokenNum, py::arg("topk_ids"),
        py::arg("num_experts"), "get expert token num");

  /**
   * moe/fused_moe/moe_redundant_topk_select.cu
   * moe_redundant_topk_select
   */
  m.def("f_moe_redundant_topk_select", &MoERedundantTopKSelectKernel,
        py::arg("gating_logits"), py::arg("expert_id_to_ep_rank_array"),
        py::arg("expert_in_rank_num_list"),
        py::arg("tokens_per_expert_stats_list"), py::arg("bias"),
        py::arg("moe_topk"), py::arg("apply_norm_weight"),
        py::arg("enable_softmax_top_k_fused"),
        py::arg("redundant_ep_rank_num_plus_one"),
        "moe export RedundantTopKSelect function");

  /**
   * open_shm_and_get_meta_signal.cc
   * InitKVSignalPerQuery
   */
  m.def("init_kv_signal_per_query", &InitKVSignalPerQuery,
        py::arg("seq_lens_encoder_tensor"),
        py::arg("seq_lens_this_time_tensor"),
        py::arg("seq_lens_decoder_tensor"), py::arg("rank"),
        py::arg("num_layers"), "init_kv_signal_per_query function");

  /**
   * GetOutputKVSignal
   */
  m.def("get_output_kv_signal", &GetOutputKVSignal, py::arg("x"),
        py::arg("rank_id"), py::arg("wait_flag"),
        "get_output_kv_signal function");

  m.def("moe_deepgemm_permute", &MoEDeepGEMMPermute, "MoEDeepGEMMPermute");
  m.def("moe_deepgemm_depermute", &MoEDeepGEMMDePermute,
        "MoEDeepGEMMDePermute");

  /**
   * alloc_cache_pinned.cc
   * cuda_host_alloc
@@ -398,12 +465,7 @@ PYBIND11_MODULE(fastdeploy_ops, m) {
        py::arg("token_nums_per_expert"), py::arg("token_nums_this_rank"),
        py::arg("moe_quant_type"), "ep moe export dispatch function");

  m.def("ep_moe_expert_dispatch_fp8", &EPMoeExpertDispatchFP8);

  m.def("ep_moe_expert_combine", &EPMoeExpertCombine, py::arg("ffn_out"),
        py::arg("expert_scales_float"), py::arg("permute_indices_per_token"),
@@ -437,6 +499,12 @@ PYBIND11_MODULE(fastdeploy_ops, m) {
   */
  m.def("moe_expert_ffn", &MoeExpertFFNFunc, "moe export ffn function");

  /**
   * moe/fused_moe/moe_ffn_wint2.cu
   * moe_expert_ffn_wint2
   */
  m.def("moe_expert_ffn_wint2", &MoeExpertFFNWint2Func,
        "moe export ffn wint2 function");

  /**
   * moe/fused_moe/moe_expert_reduce.cu
   * moe_expert_reduce
@@ -523,4 +591,66 @@ PYBIND11_MODULE(fastdeploy_ops, m) {

  m.def("group_swiglu_with_masked", &GroupSwigluWithMasked,
        "group_swiglu_with_masked function");

  m.def("text_image_index_out", &TextImageIndexOut,
        "text_image_index_out function");

  m.def("text_image_gather_scatter", &TextImageGatherScatter,
        "text_image_gather_scatter function");

  m.def("count_tokens_per_expert_func", &count_tokens_per_expert_func);
  m.def("tritonmoe_preprocess_func", &tritonmoe_preprocess_kernel);

  m.def("MoeWna16MarlinGemmApi", &MoeWna16MarlinGemmApi,
        py::arg("a"),
        py::arg("c_or_none"),
        py::arg("b_q_weight"),
        py::arg("b_scales"),
        py::arg("global_scale_or_none"),
        py::arg("b_zeros_or_none"),
        py::arg("g_idx_or_none"),
        py::arg("perm_or_none"),
        py::arg("workspace"),
        py::arg("sorted_token_ids"),
        py::arg("expert_ids"),
        py::arg("num_tokens_post_padded"),
        py::arg("topk_weights"),
        py::arg("moe_block_size"),
        py::arg("top_k"),
        py::arg("mul_topk_weights"),
        py::arg("is_ep"),
        py::arg("b_q_type_str"),
        py::arg("size_m"),
        py::arg("size_n"),
        py::arg("size_k"),
        py::arg("is_k_full"),
        py::arg("use_atomic_add"),
        py::arg("use_fp32_reduce"),
        py::arg("is_zp_float"));

  /**
   * cutlass_scaled_mm.cu
   * cutlass_scaled_mm
   * cutlass_scaled_mm_azp
   */
  m.def("cutlass_scaled_mm", &CutlassScaledMm, "cutlass_scaled_mm function");
  m.def("cutlass_scaled_mm_azp", &CutlassScaledMmAzp,
        "cutlass_scaled_mm_azp function");

  /**
   * quantization/common.cu
   * static_scaled_fp8_quant
   * dynamic_scaled_fp8_quant
   * dynamic_per_token_scaled_fp8_quant
   */
  m.def("static_scaled_fp8_quant", &StaticScaledFp8Quant,
        "static_scaled_fp8_quant function",
        py::arg("out"), py::arg("input"), py::arg("scale"));

  m.def("dynamic_scaled_fp8_quant", &DynamicScaledFp8Quant,
        "dynamic_scaled_fp8_quant function",
        py::arg("out"), py::arg("input"), py::arg("scale"));

  m.def("dynamic_per_token_scaled_fp8_quant", &DynamicPerTokenScaledFp8Quant,
        "dynamic_per_token_scaled_fp8_quant function",
        py::arg("out"), py::arg("input"), py::arg("scales"), py::arg("scale_ub"));
}
250
custom_ops/gpu_ops/cutlass_extensions/arch/memory_copy_sm80.h
Normal file
@@ -0,0 +1,250 @@
/***************************************************************************************************
 * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 * list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 * contributors may be used to endorse or promote products derived from
 * this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 **************************************************************************************************/

/*! \file
  \brief Architecture-specific operators on memory added for SM80
*/

#pragma once

#include "cutlass/cutlass.h"
#include "cutlass/complex.h"
#include "cutlass/arch/memory.h"
#include "cutlass/arch/memory_sm75.h"
#include "cutlass/arch/memory_sm80.h"
#include "cutlass/arch/cache_operation.h"

namespace cutlass {
namespace arch {

////////////////////////////////////////////////////////////////////////////////////////////////////

/// Initiates an asynchronous copy from global memory to shared memory.
///
/// cp.async
///
template <
    /// Size of the access in bytes
    int SizeInBytes,
    /// Cache operation
    CacheOperation::Kind cache_op = CacheOperation::Always,
    bool GlobalToShared = true>
struct copy;

/// Initiates an asynchronous copy from global memory to shared memory. Rather than predicate
/// the entire transfer, zeros are written to SMEM if the guard predicate is false.
///
/// cp.async
///
template <
    /// Size of the access in bytes
    int SizeInBytes,
    /// Cache operation
    CacheOperation::Kind cache_op = CacheOperation::Always,
    bool GlobalToShared = true>
struct copy_zfill;

/// Blocks until all but <N> previous cp.async.commit_group operations have committed.
///
/// cp.async
///
template <int N, bool GlobalToShared = true>
struct copy_wait;

////////////////////////////////////////////////////////////////////////////////////////////////////

/// Partial specialization
template <
    /// Size of the access in bytes
    int SizeInBytes>
struct copy<SizeInBytes, CacheOperation::Always, true> {

  /// Copy
  CUTLASS_DEVICE
  copy(void *smem_ptr, void const *global_ptr, bool pred_guard = true) {
    cp_async<SizeInBytes, CacheOperation::Always>(smem_ptr, global_ptr, pred_guard);
  }
};

/// Partial specialization
template <
    /// Size of the access in bytes
    int SizeInBytes>
struct copy<SizeInBytes, CacheOperation::Always, false> {

  /// Copy
  CUTLASS_DEVICE
  copy(void *smem_ptr, void const *global_ptr, bool pred_guard = true) {
    using AccessType = Array<uint8_t, SizeInBytes>;

    if (pred_guard) {
      *static_cast<AccessType *>(smem_ptr) = *static_cast<AccessType const *>(global_ptr);
    }
  }
};

/// Partial specialization
template <
    /// Size of the access in bytes
    int SizeInBytes>
struct copy_zfill<SizeInBytes, CacheOperation::Always, true> {

  /// Copy with zero fill
  CUTLASS_DEVICE
  copy_zfill(void *smem_ptr, void const *global_ptr, bool pred_guard) {
    cp_async_zfill<SizeInBytes, CacheOperation::Always>(smem_ptr, global_ptr, pred_guard);
  }
};

/// Partial specialization
template <
    /// Size of the access in bytes
    int SizeInBytes>
struct copy_zfill<SizeInBytes, CacheOperation::Always, false> {

  /// Copy with zero fill
  CUTLASS_DEVICE
  copy_zfill(void *smem_ptr, void const *global_ptr, bool pred_guard) {
    using AccessType = Array<uint8_t, SizeInBytes>;

    if (pred_guard) {
      *static_cast<AccessType *>(smem_ptr) = *static_cast<AccessType const *>(global_ptr);
    }
    else {
      AccessType zeros;
      zeros.clear();
      *static_cast<AccessType *>(smem_ptr) = zeros;
    }
  }
};

////////////////////////////////////////////////////////////////////////////////////////////////////

/// Partial specialization
template <
    /// Size of the access in bytes
    int SizeInBytes>
struct copy<SizeInBytes, CacheOperation::Global, true> {

  /// Copy
  CUTLASS_DEVICE
  copy(void *smem_ptr, void const *global_ptr, bool pred_guard = true) {
    cp_async<SizeInBytes, CacheOperation::Global>(smem_ptr, global_ptr, pred_guard);
  }
};

/// Partial specialization
template <
    /// Size of the access in bytes
    int SizeInBytes>
struct copy<SizeInBytes, CacheOperation::Global, false> {

  /// Copy
  CUTLASS_DEVICE
  copy(void *smem_ptr, void const *global_ptr, bool pred_guard = true) {
    using AccessType = Array<uint8_t, SizeInBytes>;

    if (pred_guard) {
      *static_cast<AccessType *>(smem_ptr) = *static_cast<AccessType const *>(global_ptr);
    }
  }
};

/// Partial specialization
template <
    /// Size of the access in bytes
    int SizeInBytes>
struct copy_zfill<SizeInBytes, CacheOperation::Global, true> {

  /// Copy with zero fill
  CUTLASS_DEVICE
  copy_zfill(void *smem_ptr, void const *global_ptr, bool pred_guard = true) {
    cp_async_zfill<SizeInBytes, CacheOperation::Global>(smem_ptr, global_ptr, pred_guard);
  }
};

/// Partial specialization
template <
    /// Size of the access in bytes
    int SizeInBytes>
struct copy_zfill<SizeInBytes, CacheOperation::Global, false> {

  /// Copy with zero fill
  CUTLASS_DEVICE
  copy_zfill(void *smem_ptr, void const *global_ptr, bool pred_guard = true) {
    using AccessType = Array<uint8_t, SizeInBytes>;

    if (pred_guard) {
      *static_cast<AccessType *>(smem_ptr) = *static_cast<AccessType const *>(global_ptr);
    }
    else {
      AccessType zeros;
      zeros.clear();
      *static_cast<AccessType *>(smem_ptr) = zeros;
    }
  }
};

/// Establishes an ordering w.r.t previously issued cp.async instructions. Does not block.
template <bool GlobalToShared>
CUTLASS_DEVICE
void copy_fence() {}

template <>
CUTLASS_DEVICE
void copy_fence<true>() {
  cp_async_fence();
}

////////////////////////////////////////////////////////////////////////////////////////////////////

/// Partial specialization
template <int N>
struct copy_wait<N, false> {

  CUTLASS_DEVICE
  copy_wait() {}
};

/// Partial specialization
template <int N>
struct copy_wait<N, true> {

  CUTLASS_DEVICE
  copy_wait() { cp_async_wait<N>(); }
};

/////////////////////////////////////////////////////////////////////////////////////////////////

} // namespace arch
} // namespace cutlass

/////////////////////////////////////////////////////////////////////////////////////////////////
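The new header only wraps the SM80 cp.async primitives behind a GlobalToShared switch. A minimal device-side usage sketch follows; it assumes CUTLASS and this header are available on the include path, and the helper function itself is illustrative rather than code from this commit.

// Illustrative device helper (assumption: not part of the commit) showing how
// the copy_zfill / copy_fence / copy_wait wrappers above are meant to combine.
#include "cutlass_extensions/arch/memory_copy_sm80.h"  // assumed include path

template <bool GlobalToShared>
__device__ void stage_fragment(void *smem_ptr, void const *gmem_ptr, bool in_bounds) {
  // 16-byte guarded copy; zero-fills shared memory when the predicate is false.
  cutlass::arch::copy_zfill<16, cutlass::arch::CacheOperation::Global,
                            GlobalToShared>(smem_ptr, gmem_ptr, in_bounds);
  // Commit the cp.async group (a no-op in the synchronous specialization).
  cutlass::arch::copy_fence<GlobalToShared>();
  // Wait for all outstanding groups, then make the tile visible to the CTA.
  cutlass::arch::copy_wait<0, GlobalToShared>();
  __syncthreads();
}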
Some files were not shown because too many files have changed in this diff.