Compare commits

..

202 Commits

Author SHA1 Message Date
Zero Rains
bd30b08521 get org_vocab_size from args (#3981) 2025-09-09 15:08:47 +08:00
Divano
1aa16146ba Update requirements.txt (#3915) 2025-09-05 13:51:22 +08:00
ApplEOFDiscord
dac0a00d0f [BugFix] fix max streaming tokens invalid (#3774) (#3856)
* Update serving_chat.py

* Update serving_completion.py

Co-authored-by: ltd0924 <32387785+ltd0924@users.noreply.github.com>
2025-09-03 17:50:29 +08:00
ltd0924
c5591c45df [BugFix] fix max streaming tokens invalid (#3774)
* Update serving_chat.py

* Update serving_completion.py
2025-09-02 21:00:29 +08:00
chen
121ac85d7d fix (#3640) 2025-08-27 14:23:38 +08:00
chen
d233e3c97c [Precision] Change lm_head layer to run in float32 (#3596)
* support lm_head fp32 bf16 fp16

* delete print

* code check

* check

* check

* code check

* check

* check
2025-08-26 20:20:06 +08:00
chen
2136990144 [Feature] Add temp_scaled_logprobs and top_p_normalized_logprobs parameters for logits and logprobs post processing (#3536)
* [feature] Add temp_scaled_logprobs and top_p_normalized_logprobs parameters for logits and logprobs post processing

* infer engine support temp_scaled_logprobs and top_p_normalized_logprobs

* code check

* code check

* fix tokenizer.decoder(-1), return 'Invalid Token'

* check seq len time shape

* logprob clip inf

* code check

---------

Co-authored-by: sunlei1024 <sunlei5788@gmail.com>
2025-08-25 14:11:18 +08:00
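For context on the commit above: temperature-scaled logprobs report log-probabilities after the sampling temperature is applied, and top-p-normalized logprobs renormalize the distribution over the top-p nucleus before taking logs. A minimal NumPy sketch of the idea (the function name and exact semantics are assumptions, not FastDeploy's implementation):

```python
import numpy as np

def temp_scaled_top_p_logprobs(logits, temperature=1.0, top_p=1.0):
    """Illustrative post-processing: logprobs after temperature scaling,
    optionally renormalized over the top-p nucleus."""
    scaled = logits / max(temperature, 1e-6)      # temperature scaling
    probs = np.exp(scaled - scaled.max())
    probs /= probs.sum()
    if top_p < 1.0:
        order = np.argsort(probs)[::-1]           # tokens by descending probability
        cum = np.cumsum(probs[order])
        keep = order[: np.searchsorted(cum, top_p) + 1]
        mask = np.zeros_like(probs)
        mask[keep] = probs[keep]
        probs = mask / mask.sum()                 # renormalize over the nucleus
    with np.errstate(divide="ignore"):
        return np.log(probs)                      # -inf outside the nucleus
```

Tokens outside the nucleus end up at -inf; the "logprob clip inf" bullet suggests the real implementation replaces those with a large negative constant.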
kevin
b7890cbe8d fix uvicorn multi worker error (#3339) 2025-08-25 11:24:07 +08:00
chenjian
bc388b65c7 [Bug fix] Fix bug in logprob in release 2.0.4 (#3445)
* fix bug for scheduler v0

* Fix logprob in release/2.0.4
2025-08-16 21:13:10 +08:00
Jiang-Jia-Jun
71af0ca04a [BugFix] Fix default log level of paddleformers (#3378) 2025-08-15 18:30:00 +08:00
YuBaoku
d66660a0d1 [CI] fix run_ci error in release/2.0.4 (#3411) 2025-08-14 22:44:17 +08:00
xiaolei373
f0519aec67 feat(log):add_request_and_response_log (#3391)
* feat(log):add_request_and_response_log

* [ci] Retrigger

* [ci] Retrigger
2025-08-14 19:12:42 +08:00
gaoziyuan
1f5983290c fix mapping (#3321) 2025-08-12 16:17:59 +08:00
chenjian
c6a133d573 [Bug fix] Fix block num in scheduler v1 for release2.0.4 (#3314)
* fix bug for scheduler v0

* fix block num setting in scheduler v1

* fix block num setting in scheduler v1

* fix block num setting in scheduler v1

* fix block num setting in scheduler v1

* fix block num setting in scheduler v1
2025-08-11 23:55:45 +08:00
chenjian
4646aff25c fix bug for scheduler v0 (#3307) 2025-08-11 23:55:20 +08:00
chenjian
a84a98b107 fix scheduler bug due to async running (#3293) 2025-08-10 13:54:59 +08:00
chenjian
c208086f61 fix scheduler bug for bs=1 (#3288) 2025-08-09 12:22:12 +08:00
sg263
ce1d4944e7 merge develop trace FD_START (#3253) (#3260)
Co-authored-by: shige <shige@baidu.com>
2025-08-07 16:06:58 +08:00
chenjian
5439fb6336 [Cherry-pick] FIx bug for scheduler V1 (#3167)
* [BUG FIX] Fix bug when preempted request rescheduled (#3080)

* Fix bug when preempted request rescheduled

* Fix bug when preempted request rescheduled

* Fix bug when preempted request rescheduled

* Fix bug for offline inference in scheduler v1 (#3117)
2025-08-04 17:08:12 +08:00
gaoziyuan
a592d17615 support qwen3 name_mapping (#3180) 2025-08-04 16:37:34 +08:00
李泳桦
eca8fc7ca6 [feat] extra parameters are all passed directly via http payload now, or in extra_body if using openai client (#3077)
* [feat] extra parameters are all passed directly via http payload now, or in extra_body if using openai client

* [fix] delete ci test case for enable_thinking

* [fix] add reasoning_parser when server starts

* [doc] update docs related to metadata

* [fix] fix ci consistency test error with reasoning parser

* [fix] cancel enable_thinking default value
2025-07-30 19:25:39 +08:00
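Since the commit above routes all extra parameters through the raw HTTP payload (or extra_body when using the OpenAI client), a request sketch looks like this; the endpoint, model name, and the enable_thinking field are placeholders inferred from the bullets above:

```python
import openai

client = openai.OpenAI(base_url="http://127.0.0.1:8000/v1", api_key="null")
resp = client.chat.completions.create(
    model="default",
    messages=[{"role": "user", "content": "Hello"}],
    # Non-standard fields go through extra_body; the client merges them into
    # the JSON payload as top-level keys.
    extra_body={"enable_thinking": False},
)
print(resp.choices[0].message.content)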
李泳桦
0463797fc2 [feat] add disable_chat_template in chat api as a substitute for previous raw_request (#3023)
* [feat] add disable_chat_template in chat api as a substitute for previous raw_request

* [fix] pre-commit code check
2025-07-25 20:57:06 +08:00
Jiang-Jia-Jun
0ab8645fc4 Update setup.py 2025-07-25 10:27:51 +08:00
xiaoxiaohehe001
2970b00dfa [Feature] Support_eplb (#2997)
* [Feature] support_eplb

* [Feature] support_eplb

* [Fix] fix mm ep
2025-07-24 20:22:45 +08:00
littledgg
f37d00e856 [Model] Provide clearer error for missing KV cache quantization scales (#3007) 2025-07-24 20:15:00 +08:00
EnflameGCU
c40df1802e [GCU] Update to develop (#2988) 2025-07-24 19:30:52 +08:00
Yzc216
980126b83a [Feature] multi source download (#3005)
* multi-source download

* multi-source download

* huggingface download revision

* requirement

* style

* add revision arg

* test

* pre-commit

* Change default download

* change requirements.txt

* modify English Documentation

* documentation
2025-07-24 17:42:09 +08:00
Zero Rains
0fb37ab7e4 update flake8 version to support pre-commit in python3.12 (#3000)
* update flake8 version to support pre-commit in python3.12

* polish code
2025-07-24 01:43:31 -07:00
Zhang Yulong
5151bc92c8 Update benchmark tools (#3004)
* update benchmark tools

* update benchmark tools
2025-07-24 15:19:23 +08:00
ltd0924
f935d6f862 [BugFix] fix multinode deployment (#2977) 2025-07-24 15:04:04 +08:00
ltd0924
3792345c3a [LLM] update function name (#2985)
* [LLM] update function name
2025-07-24 15:03:40 +08:00
Yzc216
e14587a954 [Feature] multi-source download (#2986)
* multi-source download

* multi-source download

* huggingface download revision

* requirement

* style

* add revision arg

* test

* pre-commit
2025-07-24 14:26:37 +08:00
YUNSHEN XIE
87a2f4191d add ci reuse action (#2968)
* add ci reuse action

* fix code formatting

* update
2025-07-24 14:24:10 +08:00
xiaoxiaohehe001
2c0ff068e2 [Fix] fix mm ep empty run (#2999) 2025-07-24 14:15:55 +08:00
xiegegege
e3a843f2c5 [benchmark] add quantization for benchmark yaml (#2995) 2025-07-24 13:26:34 +08:00
lizhenyun01
6235ef3881 fix chunk_prefill 2025-07-24 12:00:52 +08:00
lizhenyun01
29c3292f02 support c4 attn && fix cache 2025-07-24 12:00:52 +08:00
lizexu123
832d25334a [Code Simplification] fix init_distributed_environment() (#2982) 2025-07-24 11:43:28 +08:00
bukejiyu
bfeb664ab8 update (#2978)
2025-07-24 00:16:42 +08:00
chenjian
85a78d695d [Feature] Support block scheduler v1 for FD (#2928)
* Support FD block scheduler v1

* Support FD block scheduler v1

* Support FD block scheduler v1

* Fix according to copilot review

* Fix according to review

* Remove is_dummy

* Fix bug when real_bsz=1

* Fix infer first token cost time

---------

Co-authored-by: Jiang-Jia-Jun <163579578+Jiang-Jia-Jun@users.noreply.github.com>
2025-07-23 20:31:31 +08:00
Zero Rains
ca0f71bd39 polish code for prefill restrictions (#2991) 2025-07-23 05:10:14 -07:00
chen
172e69fe17 FA3 fix bug (#2987) 2025-07-23 19:07:43 +08:00
zhink
1272c7ce98 Fix performance degradation bug of custom_all_reduce (#2981) 2025-07-23 17:45:44 +08:00
Zero Rains
850c9d98d4 [BugFix] Add prefill restrictions for chunked_prefill+VL (#2983) 2025-07-23 01:45:57 -07:00
freeliuzc
a39a67334c fix mtp bug in pd-split mode (#2970)
2025-07-23 15:31:16 +08:00
YuBaoku
6c4cfd9359 [CI] add codestyle_check action (#2972)
* [CI] add codestyle_check action

* [CI] Integrate codestyle check via pre-commit in GitHub Actions
2025-07-23 15:21:56 +08:00
lizexu123
9b22b8d2c3 delete max-len (#2959) 2025-07-23 15:11:39 +08:00
Jiang-Jia-Jun
5b59a97030 Update README.md 2025-07-23 13:52:14 +08:00
Jiang-Jia-Jun
475dc6d84e Update README.md 2025-07-23 13:47:31 +08:00
chen
ad202272ed 【Infer】Improve block_wise_fp8 performance of triton_moe_backend (#2942) 2025-07-23 13:02:50 +08:00
lizhenyun01
e51f018577 support chunk_prefill in fa3 2025-07-23 12:19:20 +08:00
Ryan
95b5af24db [SOT] Add sot warmup (NVIDIA GPU Only) (#2929)
* add sot warmup

* fix code style

* change batch_size list

* add param to config

* rm free_list settings && set sot_warmup_sizes

* finish debug with dynamic dims by type annotations

* add profile_run guard

* rm sth useless
2025-07-22 21:36:14 +08:00
Sunny-bot1
7c5e34e72d [FIX]fix rejection sampling when topp=0 using _SAMPLING_EPS (#2967)
* fix rejection sampling when topp=0

* fix
2025-07-22 05:53:37 -07:00
gaoziyuan
dbe6225b33 fix rl config local rank (#2957) 2025-07-22 04:39:54 -07:00
GoldPancake
9b84d51e25 [MTP Fix] Fix code and register cpp operators (#2965) 2025-07-22 19:36:24 +08:00
K11OntheBoat
93bb68aa71 [Feature] Marlin MoE backend supports DeepseekV3 (#2962)
Co-authored-by: K11OntheBoat <“ruianmaidanglao@163.com”>
2025-07-22 18:11:15 +08:00
GoldPancake
dc67c10a7e [Feature][MTP]Support multi-step MTP (#2952) 2025-07-22 16:26:29 +08:00
luukunn
920e6b3f60 [Fix]fix empty prompt_token_ids,update the parser's triggering condit… (#2891) 2025-07-22 16:13:05 +08:00
Zero Rains
89a485b69f [Feature] Support using prefix-caching + cudagraph for inference (#2924)
* fix the bug in cudagraph+prefix-caching but still have some bug with profile

Change-Id: Ibf2ba3f2e3b08641d03f4b1391d7c862c3efa397

* add the signal to make sure cache manager launched

* fix judge condition

* reomove useless control

* update control stream

* update

* fix xpu

* change the do_profile flag

* update

* add new threads to init cache_manager

---------

Co-authored-by: RAM <gstian5555@outlook.com>
2025-07-22 00:59:45 -07:00
Nyakku Shigure
48e6a0ca26 [SOT] Mark dynamic dims by type annotations (#2771)
* [SOT] Mark dynamic dims by type annotations

* fix conflict of forward_meta

* mark more attn backend

* fix missing annotated and add env SOT_SPECIALIZED_DIM_NUMBERS

* auto infer implicit 0 dim dynamic dim

* revert manual marked dims

* revert missing update

* auto infer can use unsafe code in warmup stage

* check -> type_match

* fix codestyle

* restore blank line

* empty commit

* add need_warmup nonlocal;

* add doc for resolver

* add missing type hints

* unquote "ForwardMeta"
2025-07-22 00:23:52 -07:00
K11OntheBoat
e991777757 [Feature] DeepseekV3 use pd_build_static_op (#2948)
Co-authored-by: K11OntheBoat <“ruianmaidanglao@163.com”>
2025-07-22 15:03:41 +08:00
李泳桦
2a8a2c06de [fix] non-streaming api now returns full output ids if return_token_ids is enabled (#2951) 2025-07-22 14:35:56 +08:00
lifulll
2c6a9e887e native top_p_sampling (#2901) 2025-07-22 14:09:59 +08:00
gaoziyuan
0eedbdaee0 fix import error (#2944) 2025-07-22 14:06:01 +08:00
K11OntheBoat
8020927f50 [BugFix] Rename attention params of deepseekv3 (#2939)
Co-authored-by: K11OntheBoat <“ruianmaidanglao@163.com”>
2025-07-22 14:01:30 +08:00
Jiang-Jia-Jun
56102e91e1 [Polish] Return error message of raw_request (#2946)
Co-authored-by: Jiang-Jia-Jun <jiangjiajun@baidu.com>
2025-07-22 10:21:32 +08:00
zhink
0262ef7eb3 custom all reduce support cuda graph (#2938)
* Support enabling cuda graph and custom all reduce at the same time, and fix the overwritten custom all reduce flag

* rename communication_op to communication
2025-07-21 22:52:03 +08:00
周周周
ff4569f135 remove some code in ep.py (#2947) 2025-07-21 22:44:57 +08:00
李泳桦
8a619e9db5 [Feature] Add return_token_ids, prompt_token_ids, and delete training, raw_request in request body (#2940)
* [feat] add return_token_ids, prompt_token_ids, delete raw_request in request body

* [fix] return_token_ids not working in curl request

* [test] improve some test cases of return_token_ids and prompt_token_ids

* [fix] the server responds ok even if request.messages is an empty list
2025-07-21 19:31:14 +08:00
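A hedged sketch of the new request fields above with a raw HTTP payload (URL and model are placeholders; field names follow the commit title):

```python
import requests

payload = {
    "model": "default",
    "messages": [{"role": "user", "content": "Hello"}],
    # Ask the server to echo token IDs back; the commit also adds a
    # prompt_token_ids field for passing pre-tokenized input instead of text.
    "return_token_ids": True,
}
resp = requests.post("http://127.0.0.1:8000/v1/chat/completions", json=payload, timeout=60)
print(resp.json())
```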
littledgg
2845bde964 [Executor] Avoid OOM when start the service while Enable Chunked Prefill + CudaGraph (#2936)
* [Executor] Avoid OOM when start the service while Enable Chunked Prefill + CudaGraph

* Fix: Apply black formatting
2025-07-21 16:25:51 +08:00
Yuanle Liu
2f74e93d7e use dist.all_reduce(min) to sync num_blocks_local (#2933)
* pre-commit all files check

* reduce min num_blocks_local

* fix nranks=1

* pre-commit when commit-msg
2025-07-21 01:23:36 -07:00
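The commit above synchronizes KV-cache sizing with a collective: every rank computes its own num_blocks_local, then all ranks agree on the minimum, so no rank over-allocates. A sketch with Paddle's collective API (compute_local_block_budget is a hypothetical stand-in for the profiling step):

```python
import paddle
import paddle.distributed as dist

def compute_local_block_budget() -> int:
    # Hypothetical stand-in for per-rank profiling of free GPU memory.
    return 1000

dist.init_parallel_env()
num_blocks = paddle.to_tensor([compute_local_block_budget()], dtype="int32")
dist.all_reduce(num_blocks, op=dist.ReduceOp.MIN)  # every rank keeps the global minimum
print(int(num_blocks.item()))
```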
lizexu123
67990e0572 [Feature] support min_p_sampling (#2872)
* Fastdeploy support min_p

* add test_min_p

* fix

* min_p_sampling

* update

* delete vl_gpu_model_runner.py

* fix

* Align usage of min_p with vLLM

* fix

* modified unit test

* fix test_min_sampling

* pre-commit all files

* fix

* fix

* fix

* fix xpu_model_runner.py
2025-07-20 23:17:59 -07:00
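min_p sampling keeps only tokens whose probability is at least min_p times the top token's probability, which is the vLLM semantics the commit above says it aligns with. A NumPy sketch of the filter (not the FastDeploy kernel):

```python
import numpy as np

def apply_min_p(probs, min_p):
    """Drop tokens below min_p * max_prob, then renormalize."""
    threshold = min_p * probs.max()
    filtered = np.where(probs >= threshold, probs, 0.0)
    return filtered / filtered.sum()
```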
gaoziyuan
95a214ae43 support trainer_degree in name_mapping (#2935) 2025-07-20 23:12:55 -07:00
YuanRisheng
bce2c6cd7c rename test dir (#2934) 2025-07-21 14:05:45 +08:00
ltd0924
cc4cec0a74 Update engine_client.py (#2931) 2025-07-21 11:42:16 +08:00
liddk1121
17c5d3a241 [Iluvatar GPU] Add CI scripts (#2876) 2025-07-21 09:44:42 +08:00
周周周
8c5407d9e4 remove cum_offsets from ForwardMeta (#2925)
2025-07-19 23:57:27 +08:00
Zero Rains
25698d56d1 polish code with new pre-commit rule (#2923) 2025-07-19 23:19:27 +08:00
ZhangYulongg
b8676d71a8 update ci cases
2025-07-18 21:44:07 +08:00
ZhangYulongg
43976138de update ci cases 2025-07-18 21:44:07 +08:00
ZhangYulongg
e546e6b1b0 update ci cases 2025-07-18 21:44:07 +08:00
ZhangYulongg
9c8292fb19 update ci cases 2025-07-18 21:44:07 +08:00
ZhangYulongg
a5e95013b5 update ci cases 2025-07-18 21:44:07 +08:00
ZhangYulongg
93481a5478 update ci cases 2025-07-18 21:44:07 +08:00
ZhangYulongg
eb77b1be6d update ci cases 2025-07-18 21:44:07 +08:00
ming1753
5328daa333 [Bug Fix] fix ep config bug (#2920) 2025-07-18 19:12:56 +08:00
xiaoxiaohehe001
a42fc3f40b [Feature] Support 45tVL EP FP8 Infer. (#2909)
* support_mm_ep_fp8

* support_mm_ep
2025-07-18 17:57:15 +08:00
Jiang-Jia-Jun
fbe3547c95 [Feature] Support include_stop_str_in_output in chat/completion (#2910)
* [Feature] Support include_stop_str_in_output in chat/completion

* Add ci test for include_stop_str_in_output

* Update version of openai

* Fix ci test

---------

Co-authored-by: Jiang-Jia-Jun <jiangjiajun@baidu.com>
2025-07-18 16:59:18 +08:00
gaoziyuan
6efad14b95 support vl ori_vacab_size (#2900) 2025-07-18 16:26:14 +08:00
周周周
d306944f4f remove cum_offsets from get_block_shape_and_split_kv_block (#2913)
* remove padding_offsets from get_padding_offset.cu

* remove padding_offsets from get_padding_offset.cu

* remove padding_offsets from get_padding_offset.cu

* remove cum_offsets from get_block_shape_and_split_kv_block

* remove cum_offsets from get_block_shape_and_split_kv_block
2025-07-18 16:13:32 +08:00
YUNSHEN XIE
e81137e581 fix ci workflow (#2896) 2025-07-18 16:01:00 +08:00
RAM
cd52dc0f65 [Executor] Fix set capture sizes bug (#2902) 2025-07-18 15:12:19 +08:00
周周周
1339e56282 [XPU] Remove padding_offsets from get_padding_offset.cu (#2911) 2025-07-18 14:16:44 +08:00
YuanRisheng
0eb5dc18d3 [BugFix]Fix sample rejection (#2908)
* fix config

* fix rejection
2025-07-18 13:44:30 +08:00
sg263
e679567d59 [Trace]fix opentelemetry can not work in uvicorn (#2906)
* add opentelemetry

* add opentelemetry

* add opentelemetry on dequeue

* add opentelemetry on dequeue

* add opentelemetry on dequeue

* fix annotation

* fix annotation when add opentelemetry

* fix opentelemetry-instrumentation-fastapi

* fix opentelemetry-bootstrap

* fix opentelemetry can not work in uvicorn

* move conf to env

---------

Co-authored-by: Jiang-Jia-Jun <163579578+Jiang-Jia-Jun@users.noreply.github.com>
2025-07-17 23:16:45 +08:00
RAM
bbe2c5c968 Update GraphOptimizationBackend docs (#2898) 2025-07-17 21:38:18 +08:00
ltd0924
4b14dca1d6 [LLM] delete fixed slots (#2893)
2025-07-17 19:19:54 +08:00
yulangz
c8c280c4d3 [XPU][Doc] fix typo (#2892) 2025-07-17 19:13:54 +08:00
周周周
ddb10ac509 [Inference, rename] remove padding_offsets from atten use batch_id_per_token (#2880)
* remove padding_offsets from atten
2025-07-17 18:41:31 +08:00
freeliuzc
d49f8fb30a [Feature][MTP] Support cacheKV transfer in per_chunk mode (#2890)
* support chunk_prefill both normal and speculative_decoding(mtp)

* optimize pd-disaggregation config

* fix bug
2025-07-17 17:58:08 +08:00
ming1753
67180c1ff9 [Bug Fix] fix bug of prompt penalty (#2888) 2025-07-17 17:21:37 +08:00
Xintong Yu
273efba76f [Fix] remove misleading variables (#2841)
Co-authored-by: Jiang-Jia-Jun <163579578+Jiang-Jia-Jun@users.noreply.github.com>
2025-07-17 16:49:14 +08:00
YUNSHEN XIE
1cfba5ba3e enable CI workflow for pull requests targeting release/* branches (#2887) 2025-07-17 16:48:03 +08:00
Jiang-Jia-Jun
31cab9f87b Update test_openai.py 2025-07-17 16:07:31 +08:00
Jiang-Jia-Jun
d3dfa1446c Update test_openai.py 2025-07-17 16:07:07 +08:00
ltd0924
b630031414 [LLM] fix several bugs (#2878) 2025-07-17 14:21:05 +08:00
LokeZhou
f50c25178b [MM_PROCESS] add _extract_labels (#2879) 2025-07-17 14:20:01 +08:00
Yuanle Liu
dbb9e2506b Fix rollout_model init (#2881) 2025-07-16 22:36:21 -07:00
ming1753
1f15ca21e4 [Feature] support prompt repetition_penalty (#2806)
2025-07-17 12:05:52 +08:00
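The commit above extends the repetition penalty to tokens that appear in the prompt, not only previously generated ones. A generic NumPy sketch of the standard formulation (FastDeploy's kernel may differ):

```python
import numpy as np

def apply_repetition_penalty(logits, token_ids, penalty):
    """Penalize every token seen so far (prompt + generated):
    positive logits are divided by penalty, negative ones multiplied."""
    ids = np.unique(token_ids)
    logits[ids] = np.where(logits[ids] > 0, logits[ids] / penalty, logits[ids] * penalty)
    return logits
```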
yulangz
7dfd2ea052 [XPU][doc] Update minimal fastdeploy required (#2863)
* [XPU][doc] update minimal fastdeploy required
2025-07-17 11:33:22 +08:00
GoldPancake
42d4001400 [Features] Add speculative metrics (#2857) 2025-07-17 11:08:55 +08:00
sg263
52aca233e8 [Trace] fix annotation when add opentelemetry (#2869)
* add opentelemetry

* add opentelemetry

* add opentelemetry on dequeue

* add opentelemetry on dequeue

* add opentelemetry on dequeue

* fix annotation

* fix annotation when add opentelemetry

* fix opentelemetry-instrumentation-fastapi

* fix opentelemetry-bootstrap

---------

Co-authored-by: Jiang-Jia-Jun <163579578+Jiang-Jia-Jun@users.noreply.github.com>
2025-07-17 10:29:16 +08:00
ltd0924
9c25dcca0b [LLM] Update Multinode Deployment (#2830)
* [LLM] fix multinode bugs

* [LLM] update multinode deployment

* [LLM] update multinode deployment

* [LLM] update multinode deployment

* [LLM] update multinode deployment

* [LLM] update multinode deployment

* [LLM] fix ci bugs

* Update fastdeploy/engine/args_utils.py

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>

* [LLM] update random port

* [LLM] update random port

* [LLM] fix ci bugs

* fix ci bugs

---------

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
2025-07-16 23:42:54 +08:00
ltd0924
d245d1ca6c [LLM] support send batch data and aggregate data (#2860)
* [LLM] support send batch data and aggregate data

* [LLM] fix ci bugs

* [LLM] fix ci bugs

* [LLM] fix ci bugs

* [LLM] fix ci bugs

* [LLM] update
2025-07-16 23:42:20 +08:00
Yuanle Liu
63d6e7ce06 fix and refine vl (#2866)
* refine vl config

* delete attn_sep

* fix vl accuracy
2025-07-16 05:59:28 -07:00
周周周
aa76085d1f [Attention] remove cum_offsets from atten, and use cu_seqlens_q (#2870)
[Attention] remove cum_offsets from atten, and use cu_seqlens_q (#2870)
2025-07-16 20:10:57 +08:00
sg263
42b80182e0 [Trace] add opentelemetry (#2852)
* add opentelemetry

* add opentelemetry

* add opentelemetry on dequeue

* add opentelemetry on dequeue

* add opentelemetry on dequeue

---------

Co-authored-by: Jiang-Jia-Jun <163579578+Jiang-Jia-Jun@users.noreply.github.com>
2025-07-16 15:33:25 +08:00
Yuanle Liu
dda4a9f848 rl update (#2861) 2025-07-16 00:33:10 -07:00
yangjianfengo1
a83a3eea5f Change FLAGS_max_partition_size to be read from an environment variable (#2854) 2025-07-16 14:14:21 +08:00
xiaoxiaohehe001
0d0340392f [Fix] Fix mm ep weight init. (#2855)
* fix_45t_mm

* Update load_weight_utils.py

* Update load_weight_utils.py
2025-07-16 12:02:39 +08:00
YuanRisheng
0253381fb9 fix config (#2858) 2025-07-16 11:40:10 +08:00
freeliuzc
2d1184aefe [Fix] fix expert_parallel bug in decoder stage (#2848) 2025-07-16 11:08:18 +08:00
yulangz
17314ee126 [XPU] Update doc and add scripts for downloading dependencies (#2845)
* [XPU] update xvllm download

* update supported models

* fix xpu model runner in huge memory with small model

* update doc
2025-07-16 11:05:56 +08:00
YuanRisheng
101ad33332 [BugFix] Fix Configs (#2849)
* fix config

* fix config
2025-07-15 19:50:36 -07:00
RAM
0fad10b35a [Executor] CUDA Graph support padding batch (#2844)
* cuda graph support padding batch

* Integrate the startup parameters for the graph optimization backend and provide support for user - defined capture sizes.

* Do not insert max_num_seqs when the user specifies a capture list

* Support set graph optimization config from YAML file

* update cuda graph ci

* fix ci bug

* fix ci bug
2025-07-15 19:49:01 -07:00
Yuanle Liu
61b3997b85 refactor rl get_name_mappings_to_training (#2847)
* refactor rl get_name_mappings_to_training

* fix tp>1

* change variable name(ffn1->up_gate_proj/ffn2->down_proj)

* change variable name(linear_weight->weight/linear_bias->bias)

* add rl names mapping for vl

* fix ernie 0.3B error

* fix develop code

* fix
2025-07-15 07:31:42 -07:00
Zero Rains
e7bcbbab52 Merge vl execution path into normal execution path (#2829)
* merge vl model into gpu_model runner

Change-Id: I9f4691a3d5f135e8d72b1d58abcd15ef3aa3f2a6

* fix chinese

Change-Id: Ic7405109b984c21e076fb3b01ff6feb571d0119a

* fix the parse parameter

Change-Id: I4cd62ee87c06220af580d91e347145d4394917fe

* fix the bug in online_inference

Change-Id: Idb111bb2114e83017c4050b2a68cf039c6d3c559

* polish code

Change-Id: I7d4194102c2f1b0743b74fbd5fc284eb8ef4d17c
2025-07-15 22:20:03 +08:00
zhenwenDang
5fc659b900 [Docs] add enable_logprob parameter description (#2850)
* add enable_logprob parameter description

* add enable_logprob parameter description

* add enable_logprob parameter description

* add enable_logprob parameter description

* add enable_logprob parameter description

* add enable_logprob parameter description

---------

Co-authored-by: Jiang-Jia-Jun <163579578+Jiang-Jia-Jun@users.noreply.github.com>
2025-07-15 19:47:45 +08:00
ophilia-lee
33db137d0b Add a YAML of default vLLM request parameters 2025-07-15 19:31:27 +08:00
lijingning
9d6a42b334 Adapt to vLLM lacking arrival_time; adapt to vLLM requiring the model field; add a case-number field no to RequestFuncInput/RequestFuncOutput/SampleRequest 2025-07-15 19:31:27 +08:00
Jiang-Jia-Jun
1b712bba82 Update setup.py 2025-07-15 14:57:23 +08:00
AIbin
fd91da7b41 【Inference Optimize】Support wint2 triton kernel via triton_utils_v2 (#2842)
* update supported_models doc
2025-07-15 14:35:40 +08:00
bukejiyu
15c8c240b5 [vl] Use top_k from config.json (#2831)
2025-07-15 00:39:12 +08:00
freeliuzc
7cdd8d290d [MTP] optimize mtp infer speed (#2840)
2025-07-14 19:50:22 +08:00
YuanRisheng
4c7b8bc458 Simplify the Config code (#2770)
* simplify the code

* fix vl

* delete config

* fix

* perfect code

* fix ci

* fix xpu

* fix xpu

* fix server

* resolve conflict

* fix mtp

* resolve conflict

* fix xpu

* fix xpu

* fix vl

* fix log

* fix qwen moe

* fix qwen moe

* fix qwen moe
2025-07-14 19:50:05 +08:00
freeliuzc
2e81792d64 [fix] fix 'force-reinstall all-depe-packages in build' (#2837) 2025-07-14 16:50:54 +08:00
AIbin
b7858c22d9 【Update Docs】update supported_models doc (#2836)
* update supported_models doc
2025-07-14 16:01:34 +08:00
GoldPancake
09bbac6de0 Add DeepGEMM pre-compile tools (#2819)
This tool compiles all possible kernels in advance from the model's config.json, avoiding the case where an uncompiled kernel is encountered and JIT compilation runs when certain requests arrive.
2025-07-14 14:56:41 +08:00
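A conceptual sketch of that ahead-of-time flow; the shape enumeration and compile_kernel are hypothetical placeholders, since only the config.json-driven idea is described above:

```python
import itertools
import json

def compile_kernel(m: int, n: int, k: int) -> None:
    # Hypothetical placeholder for DeepGEMM's JIT compile-and-cache step.
    print(f"compiling GEMM kernel for M={m}, N={n}, K={k}")

cfg = json.load(open("model/config.json"))  # assumed model directory layout
hidden = cfg["hidden_size"]
inter = cfg.get("moe_intermediate_size", cfg.get("intermediate_size", 4 * hidden))
ms = [1, 8, 32, 128, 1024]  # candidate batch dims seen at serving time (assumed)

for m, (n, k) in itertools.product(ms, [(inter, hidden), (hidden, inter)]):
    compile_kernel(m, n, k)
```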
freeliuzc
7f64d408a9 [MTP] support expert-parellel in mtp (#2835) 2025-07-14 14:28:50 +08:00
lddfym
ece88596ed fix spelling error (#2827) 2025-07-14 13:12:57 +08:00
bukejiyu
bad53c6b6e [vl]remove duplicated load logic (#2744)
2025-07-13 07:36:26 +08:00
xiegegege
16940822a7 add result save for ci (#2824)
LGTM
2025-07-12 23:34:46 +08:00
zhenwenDang
d48c03413f Feature/logprob bug fix (#2817)
* fix: handle missing logprobs at step 0 and incorrect finish reason with max_completion_tokens

* Prevent response_logprobs.logprob_token_ids[0] from going out of bounds
2025-07-12 16:48:51 +08:00
gaoziyuan
e9e8443ea8 fix num_blocks_local when small size model in TP2 running mode (#2792) 2025-07-12 12:50:48 +08:00
gaoziyuan
749b2e9c89 support qwen3moe name_mapping (#2820) 2025-07-12 12:05:54 +08:00
Sunny-bot1
f6ad26fc08 fix topp default value (#2814)
2025-07-11 17:10:21 +08:00
zhink
c08561c13a [Feature] support tensor-parallel-size>num_key_value_heads for qwen3 (#2799) 2025-07-11 15:09:43 +08:00
chen
2c3607407f check (#2811) 2025-07-11 13:54:52 +08:00
lddfym
b5e4288704 Global scheduler supports configuring hot updates (#2807)
* Check if the controller port is available

* Global scheduler supports configuring hot updates

* add interface: /controller/scheduler

* add interface: /controller/scheduler
2025-07-11 13:38:07 +08:00
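The commit above adds a /controller/scheduler endpoint so scheduler settings can be hot-updated without a restart. A hedged sketch of calling it (only the path comes from the commit; the host, port, and payload key are assumptions):

```python
import requests

resp = requests.post(
    "http://127.0.0.1:8000/controller/scheduler",
    json={"ttl": 900},  # hypothetical scheduler setting to hot-update
    timeout=10,
)
print(resp.status_code, resp.text)
```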
yulangz
abbbd0cddc [XPU] Update docker file (#2809) 2025-07-11 13:26:38 +08:00
yinwei
e98937cbba delete useless file (#2772)
Co-authored-by: Jiang-Jia-Jun <163579578+Jiang-Jia-Jun@users.noreply.github.com>
2025-07-11 11:46:04 +08:00
Sunny-bot1
240d6236bc [Fix]fix top_k_top_p sampling (#2801)
* fix topk-topp

* update

* add base_non_truncated
2025-07-10 22:35:10 +08:00
littledgg
59071268b6 [Executor] Move forward_meta.py to fastdeploy/model_executor (#2774)
* Use PEP 563 in attention.py and fix conflict

* merge commit

* Change what was left out last time
2025-07-10 20:36:51 +08:00
lizexu123
8c660a0dfb [BugFix] fix RMSNorm rms_norm_esp (#2797)
* fix rms

* add vl

* fix

* add vl

* fix

* fix
2025-07-10 20:02:24 +08:00
LiqinruiG
ce5adec877 [Doc] modify offline-inference docs (#2800)
* modify offline-inference docs

* [bug] remove tool_call_content
2025-07-10 19:41:12 +08:00
Zeyu Chen
36571fd2d9 Update README.md
2025-07-10 17:01:08 +08:00
yulangz
830de5a925 [XPU] Supports TP4 deployment on 4,5,6,7 (#2794)
* Support running on devices 4,5,6,7 specified via XPU_VISIBLE_DEVICES
* Update the multi-card notes in the XPU documentation
2025-07-10 16:48:08 +08:00
chen
d33105baeb [Feature] Online Chat API Support Return logprobs (#2777)
* online chat support logprobs

* check xpu

* check vl_gpu_model_runner and xpu_model_runner

* get_worker() check platform
2025-07-10 16:33:40 +08:00
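With the commit above, the OpenAI-compatible chat API can return log-probabilities through the standard request fields; a sketch (endpoint and model are placeholders):

```python
import openai

client = openai.OpenAI(base_url="http://127.0.0.1:8000/v1", api_key="null")
resp = client.chat.completions.create(
    model="default",
    messages=[{"role": "user", "content": "Hi"}],
    logprobs=True,
    top_logprobs=5,
)
for tok in resp.choices[0].logprobs.content:
    print(tok.token, tok.logprob)
```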
K11OntheBoat
24f934f1f9 [BugFix] Fix low prediction accuracy of deepseekv3 (#2798) 2025-07-10 16:16:44 +08:00
Sunny-bot1
1e2319cbef Rename top_p_sampling to top_k_top_p_sampling (#2791) 2025-07-10 00:09:25 -07:00
Sunny-bot1
e45050cae3 [Feature] support top_k_top_p sampling (#2753)
* support top_k_top_p sampling

* fix

* add api param

* add api para

* fix

* fix

* fix

* fix

* fix

* fix

* fix
2025-07-09 20:58:58 -07:00
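For reference, generic top-k followed by top-p (nucleus) sampling looks like the sketch below; this illustrates the technique the commit adds, not FastDeploy's GPU kernel:

```python
import numpy as np

def top_k_top_p_sample(logits, top_k=50, top_p=0.95, seed=None):
    """logits: 1-D np.ndarray of unnormalized scores."""
    rng = np.random.default_rng(seed)
    probs = np.exp(logits - logits.max())
    probs /= probs.sum()
    order = np.argsort(probs)[::-1][:top_k]           # keep the top_k most likely tokens
    cum = np.cumsum(probs[order])
    order = order[: np.searchsorted(cum, top_p) + 1]  # shrink to the top_p nucleus
    p = probs[order] / probs[order].sum()
    return int(rng.choice(order, p=p))
```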
Ryan
b0f525955c [SOT] Remove breakgraph in post processing && fix datatype (#2780) 2025-07-10 11:26:00 +08:00
Yuanle Liu
2ea267f624 assert prompt len > 0 (#2773) 2025-07-10 11:14:52 +08:00
0x3878f
1d8af7ab73 Add env variable for dy2st (#2779) 2025-07-10 11:06:06 +08:00
LiqinruiG
54affdc44b [Doc] modify offline_inference docs (#2787)
* modify reasoning_output docs

* modify offline inference docs

* modify offline inference docs

* modify offline_inference docs

* modify offline_inference docs
2025-07-10 01:06:14 +08:00
Jiang-Jia-Jun
a4fdb3970b [BugFix] Fix vocab size error for ernie model (#2785)
* [BugFix] Fix vocab size error for ernie model

* [BugFix] Fix vocab size error for ernie model

---------

Co-authored-by: Jiang-Jia-Jun <jiangjiajun@baidu.com>
2025-07-10 01:05:51 +08:00
Jiang-Jia-Jun
2a86928657 [BugFix Revert] Fix vocab size error for ernie model 2025-07-09 22:14:54 +08:00
Jiang-Jia-Jun
b1c53fa779 [BugFix] Fix vocab size error for ernie model 2025-07-09 22:13:41 +08:00
lizexu123
da20cf681e [Bug fix] Fixed the garbled text issues in Qwen3-8B (#2783) 2025-07-09 22:03:57 +08:00
LiqinruiG
4ccd1696ab [Doc] modify offline inference docs (#2747)
* modify reasoning_output docs

* modify offline inference docs

* modify offline inference docs
2025-07-09 20:53:26 +08:00
chen
888780ffde [Feature] block_wise_fp8 support triton_moe_backend (#2767) 2025-07-09 19:22:47 +08:00
RAM
e3768c5a83 [Executor] Fix bug of logger.debug (#2778) 2025-07-09 04:13:43 -07:00
lifulll
1f28bdf994 dcu adapter ernie45t (#2756)
Co-authored-by: lifu <lifu@sugon.com>
Co-authored-by: yongqiangma <xing.wo@163.com>
2025-07-09 18:56:27 +08:00
RAM
03a74995b8 Clear dead code and add supplementary notes (#2757)
* 1.supplementary notes 2.delete dead code

* fix bug of forward meta

* Global modification of forward meta

* fix vl model_runner bug
2025-07-09 16:17:34 +08:00
zhink
b89180f1cd [Feature] support custom all-reduce (#2758)
* [Feature] support custom all-reduce

* add vllm adapted
2025-07-09 16:00:27 +08:00
yulangz
be21ef5047 [XPU] Supports BF16 for ERNIE-4.5-21B-A3B and ERNIE-4.5-0.3B (#2765)
* fix no quant xpu moe

* change dir of xpu moe weight only
2025-07-09 15:57:51 +08:00
celsowm
771e71a24d Feat/blackwell sm100 support (#2670)
* Add initial support for NVIDIA Blackwell (SM100) architecture

This change introduces initial support for the NVIDIA Blackwell GPU
architecture, specifically targeting SM100 (Compute Capability 10.x)
with '100a' architecture-specific features (e.g., for CUTLASS).

Key changes:
- Updated custom_ops/setup_ops.py to generate appropriate gencode
  flags (arch=compute_100a,code=sm_100a) when '100' is specified
  in FD_BUILDING_ARCS. Requires CUDA 12.9+.
- Updated custom_ops/gpu_ops/cutlass_extensions/gemm_configs.h:
    - Added CutlassTileConfigSM100 enum (with placeholder tile shapes).
    - Added BLACKWELL to CandidateConfigTypeParam.
    - Updated CutlassGemmConfig struct with is_sm100 flag,
      tile_config_sm100, and new constructor for SM100.
    - Modified toString() and fromString() for SM100 support.
- Updated custom_ops/gpu_ops/cutlass_kernels/cutlass_heuristic.cu:
    - Added get_candidate_tiles_sm100() (with placeholder tiles).
    - Added placeholder mcast support functions for SM100.
    - Updated get_candidate_configs() to include SM100 paths using
      the BLACKWELL flag and new SM100 config types.
- Updated build.sh with comments to guide users on specifying '100'
  for Blackwell in FD_BUILDING_ARCS.

Further work:
- Optimal CUTLASS tile configurations for SM100 need to be researched
  and updated in cutlass_heuristic.cu.
- Kernel auto-generation scripts in custom_ops/utils/ may need
  SM100-specific versions if Blackwell's hardware features for FP8/TMA
  differ significantly from SM90.
- Compatibility of third-party libraries (CUTLASS v3.8.0, DeepGEMM)
  with Blackwell should be fully verified.

* Feat: Implement detailed Blackwell (SM100) CUTLASS heuristics

This change integrates specific, expert-provided CUTLASS heuristic
configurations for the NVIDIA Blackwell (SM100) GPU architecture,
replacing previous placeholders. This includes:

- Updated `custom_ops/gpu_ops/cutlass_extensions/gemm_configs.h`:
    - Populated `CutlassTileConfigSM100` enum with specific tile shapes
      (e.g., CtaShape64x64x128B, CtaShape128x128x128B) suitable for SM100.
    - Added `FP4_ONLY` to `CandidateConfigTypeParam` for new FP4 paths.

- Updated `custom_ops/gpu_ops/cutlass_kernels/cutlass_heuristic.cu`:
    - Implemented `get_candidate_tiles_sm100` with detailed logic for
      selecting tile configurations based on GROUPED_GEMM and FP4_ONLY flags,
      using the new SM100 tile enums.
    - Implemented `supports_mcast_along_m_sm100` and
      `supports_mcast_along_n_sm100` with specific tile checks for Blackwell.
    - Updated the `sm == 100` (Blackwell) block in `get_candidate_configs`
      to use these new helper functions and accurately populate candidate
      kernel configurations for various cluster shapes.

- `custom_ops/setup_ops.py` remains configured to compile for
  `arch=compute_100a,code=sm_100a` with CUDA 12.9+ for these features.

This aligns the codebase with heuristic configurations similar to those
in upstream TensorRT-LLM / CUTLASS for Blackwell, enabling more
performant kernel selection on this new architecture.

---------

Co-authored-by: google-labs-jules[bot] <161369871+google-labs-jules[bot]@users.noreply.github.com>
Co-authored-by: Jiang-Jia-Jun <163579578+Jiang-Jia-Jun@users.noreply.github.com>
2025-07-09 15:29:42 +08:00
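The key build-system change described above boils down to emitting an architecture-specific gencode pair for SM100. A simplified sketch of the flag selection (the real logic lives in custom_ops/setup_ops.py and also checks for CUDA 12.9+):

```python
def gencode_flags(building_archs):
    # Blackwell (SM100) needs the '100a' architecture-specific target; older
    # archs use the plain compute_XX/sm_XX pair.
    flags = []
    for arch in building_archs:
        if arch == 100:
            flags += ["-gencode", "arch=compute_100a,code=sm_100a"]
        else:
            flags += ["-gencode", f"arch=compute_{arch},code=sm_{arch}"]
    return flags

print(gencode_flags([80, 90, 100]))
```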
yulangz
0350831c2b fix xpu offline demo garbled output (#2763) 2025-07-09 14:51:20 +08:00
RichardWooSJTU
fee544e808 fix ep prefill (#2762) 2025-07-09 14:03:05 +08:00
Ryan
c4718fd693 Enable SOT D2St in Multimodal Model (#2735) 2025-07-09 12:26:18 +08:00
GoldPancake
f7cad30a38 [Feature] Add speculative decoding simulation benchmark. (#2751)
* Add speculative decoding simulation benchmark

* Fix the name of the parameter
2025-07-09 12:08:43 +08:00
gaoziyuan
6b10c19482 【Feature】add fd commit/branch info when starting the server (#2752)
* add_commit_config

* fix

---------

Co-authored-by: Jiang-Jia-Jun <163579578+Jiang-Jia-Jun@users.noreply.github.com>
2025-07-09 11:52:22 +08:00
EnflameGCU
f4f1d8de44 Support for non-CUDA builds (#2750)
Co-authored-by: yongqiangma <xing.wo@163.com>
2025-07-09 11:48:40 +08:00
RichardWooSJTU
6610aa29d0 Revert "[Bug fix] fix attention rank init (#2743)" (#2761)
This reverts commit e8bbe7244b.
2025-07-09 10:38:12 +08:00
Ryan
f72c4de539 [SOT] Make custom_op dy&st unified (#2733)
* make_custom_op dy&st unified

* add instance judgement
2025-07-08 19:21:44 +08:00
xiegetest
f6ffbc3cbd add precision check for ci (#2732)
* add precision check for ci

* add precision check for ci

* add precision check for ci

* add precision check for ci

---------

Co-authored-by: xiegegege <xiege01@baidu.com>
2025-07-08 18:43:53 +08:00
RichardWooSJTU
e8bbe7244b [Bug fix] fix attention rank init (#2743)
* fix attention rank init

* fix attention rank init
2025-07-08 17:19:49 +08:00
Longzhi Wang
57b086dc6b [Bug fix] Add the missing pod_ip param to the launch_cache_manager function. (#2742)
* [Bug fix] fix the missing position args in expert_service.py

* update
2025-07-08 14:52:13 +08:00
lizexu123
525be243e7 [Bug fix] Fixed the garbled text issues in Qwen3-8B (#2737)
* fix qwen3.py

* update

* update lm_head tie_word_embeddings

* update tie_word_embeddings

* fix

* fix tie_word_embedding not in config.json

---------

Co-authored-by: lizexu <lizexu@baidu.com>
2025-07-07 23:15:27 -07:00
EnflameGCU
d0f4d6ba3a [GCU] Support gcu platform (#2702)
baseline: e7fa57ebae

Co-authored-by: yongqiangma <xing.wo@163.com>
2025-07-08 13:00:52 +08:00
gaoziyuan
26d5d737dd 【Feature】support some qwen2 functions (#2740)
* add rl qwen model support

* fix

* fix
2025-07-08 12:03:04 +08:00
Ryan
fefbd65cf8 [SOT] Remove BreakGraph with paddle.maximum (#2731)
* rm if with clip

* clip -> maximum

* int64 -> int32
2025-07-08 11:44:25 +08:00
ming1753
1eb8ea7328 [Bug fix] fix complie bug when sm < 89 (#2738) 2025-07-08 11:24:52 +08:00
ming1753
ef6649a577 [Optimize] Optimize tensorwise fp8 performance (#2729)
* [Optimize] Optimize tensorwise fp8 performance
2025-07-07 20:06:28 +08:00
liddk1121
1b54a2831e Adapt for iluvatar gpu (#2684) 2025-07-07 16:53:14 +08:00
YUNSHEN XIE
2579e8fea8 support FastDeploy version setting (#2725)
2025-07-07 14:50:11 +08:00
Yuanle Liu
91528f1af9 remove redundant install whl of fastdeploy (#2726)
* remove redundant install

* remove redundant install
2025-07-06 23:49:37 -07:00
lddfym
4e293e50fa Check if the controller port is available (#2724) 2025-07-07 13:24:55 +08:00
chen
66b321d9ec Update eb45-0.3B cuda memory (#2686) 2025-07-07 11:31:15 +08:00
ltd0924
68b4755587 [LLM] support multi node deploy (#2708)
* [LLM] support multi node deploy

* Update engine.py

* fix bugs

* fix

* [LLM] support multi node deploy

* [LLM] support multi node deploy

---------

Co-authored-by: Jiang-Jia-Jun <163579578+Jiang-Jia-Jun@users.noreply.github.com>
2025-07-06 10:33:51 +08:00
LQX
04a8e1ef2b Update the XPU CI, test=model (#2721) 2025-07-06 10:19:04 +08:00
Ting
a6e9161045 fix bug. (#2718)
2025-07-05 08:19:19 +08:00
579 changed files with 36457 additions and 19065 deletions

.flake8 (new file, +7)

@@ -0,0 +1,7 @@
[flake8]
ignore = E203, E402, E501, E731, E741, W503, W605, E722, E231, W604, E702, E226, E221, E713, E271
max-line-length = 119
# E402: module level import not at top of file
per-file-ignores =
    __init__.py:F401,F403,E402

.github/workflows/Codestyle-Check.yml (new file, +48)

@@ -0,0 +1,48 @@
name: Codestyle-Check

on:
  pull_request:
    branches: ["develop"]

jobs:
  pre-commit:
    name: Pre Commit
    if: ${{ github.repository_owner == 'PaddlePaddle' }}
    runs-on: ubuntu-latest
    env:
      PR_ID: ${{ github.event.pull_request.number }}
      BRANCH: develop
    steps:
      - name: Cleanup
        run: |
          rm -rf * .[^.]*
      - name: Checkout base repo
        uses: actions/checkout@v4
        with:
          ref: ${{ github.event.pull_request.base.ref }}
          fetch-depth: 1000
      - name: Merge PR to test branch
        run: |
          git fetch origin pull/${PR_ID}/merge
          git checkout -b test FETCH_HEAD
      - name: Setup python3.10
        uses: actions/setup-python@v5
        with:
          python-version: '3.10'
          cache: 'pip'
      - name: Install dependencies
        run: |
          pip install pre-commit==4.2.0 cpplint==1.6.0 clang-format==13.0.0
      - name: Check pre-commit
        env:
          SKIP_CLANG_TIDY_CHECK: "ON"
        run: |
          set +e
          bash -x tools/codestyle/pre_commit.sh;EXCODE=$?
          exit $EXCODE

.github/workflows/_build_linux.yml (new file, +174)

@@ -0,0 +1,174 @@
name: FastDeploy Linux GPU Build Task
description: "FastDeploy packages build and upload"

on:
  workflow_call:
    inputs:
      DOCKER_IMAGE:
        description: "Build Images"
        required: true
        type: string
        default: "ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/paddleqa:cuda126-py310"
      FASTDEPLOY_ARCHIVE_URL:
        description: "URL of the compressed FastDeploy code archive."
        required: true
        type: string
      COMPILE_ARCH:
        description: "Build GPU Archs"
        required: true
        type: string
        default: "80,90"
      WITH_NIGHTLY_BUILD:
        description: "Enable nightly build mode (e.g. add date suffix to version)"
        required: false
        type: string
        default: "ON"
      FD_VERSION:
        description: "FastDeploy Package Version"
        required: false
        type: string
        default: ""
      UPLOAD:
        description: "Upload Package"
        required: false
        type: string
        default: "ON"
      CACHE_DIR:
        description: "Cache Dir Use"
        required: false
        type: string
        default: ""
    outputs:
      wheel_path:
        description: "Output path of the generated wheel"
        value: ${{ jobs.fd-build.outputs.wheel_path }}

jobs:
  fd-build:
    runs-on: [self-hosted, GPU-h1z1-4Cards]
    outputs:
      wheel_path: ${{ steps.set_output.outputs.wheel_path }}
    steps:
      - name: Code Prepare
        shell: bash
        env:
          docker_image: ${{ inputs.DOCKER_IMAGE }}
          fd_archive_url: ${{ inputs.FASTDEPLOY_ARCHIVE_URL }}
          IS_PR: ${{ github.event_name == 'pull_request' }}
        run: |
          set -x
          REPO="https://github.com/${{ github.repository }}.git"
          FULL_REPO="${{ github.repository }}"
          REPO_NAME="${FULL_REPO##*/}"
          BASE_BRANCH="${{ github.base_ref }}"
          # Clean the repository directory before starting
          docker run --rm --net=host -v $(pwd):/workspace -w /workspace \
            -e "REPO_NAME=${REPO_NAME}" \
            ${docker_image} /bin/bash -c '
            if [ -d ${REPO_NAME} ]; then
              echo "Directory ${REPO_NAME} exists, removing it..."
              rm -rf ${REPO_NAME}*
            fi
          '
          wget -q ${fd_archive_url}
          tar -xf FastDeploy.tar.gz
          rm -rf FastDeploy.tar.gz
          cd FastDeploy
          git config --global user.name "FastDeployCI"
          git config --global user.email "fastdeploy_ci@example.com"
          git log -n 3 --oneline
      - name: FastDeploy Build
        shell: bash
        env:
          docker_image: ${{ inputs.DOCKER_IMAGE }}
          compile_arch: ${{ inputs.COMPILE_ARCH }}
          fd_version: ${{ inputs.FD_VERSION }}
          CACHE_DIR: ${{ inputs.CACHE_DIR }}
        run: |
          set -x
          runner_name="${{ runner.name }}"
          CARD_ID=$(echo "${runner_name}" | cut -d'-' -f2)
          gpu_id=$(echo "$CARD_ID" | fold -w1 | paste -sd,)
          CACHE_DIR=${CACHE_DIR:-${{ github.workspace }}}
          echo "CACHE_DIR is set to ${CACHE_DIR}"
          if [ ! -f "${CACHE_DIR}/gitconfig" ]; then
            touch "${CACHE_DIR}/gitconfig"
          fi
          PARENT_DIR=$(dirname "$WORKSPACE")
          echo "PARENT_DIR:$PARENT_DIR"
          docker run --rm --net=host \
            --cap-add=SYS_PTRACE --privileged --shm-size=64G \
            -v $(pwd):/workspace -w /workspace \
            -v "${CACHE_DIR}/gitconfig:/etc/gitconfig:ro" \
            -v "${CACHE_DIR}/.cache:/root/.cache" \
            -v "${CACHE_DIR}/ConfigDir:/root/.config" \
            -e TZ="Asia/Shanghai" \
            -e "COMPILE_ARCH=${compile_arch}" \
            -e "FD_VERSION=${fd_version}" \
            -e "WITH_NIGHTLY_BUILD=${WITH_NIGHTLY_BUILD}" \
            --gpus "\"device=${gpu_id}\"" ${docker_image} /bin/bash -c '
            if [[ -n "${FD_VERSION}" ]]; then
              export FASTDEPLOY_VERSION=${FD_VERSION}
              echo "Custom FastDeploy version: ${FASTDEPLOY_VERSION}"
            fi
            git config --global --add safe.directory /workspace/FastDeploy
            cd FastDeploy
            if [[ "${WITH_NIGHTLY_BUILD}" == "ON" ]];then
              GIT_COMMIT_TIME=$(git --no-pager show -s --format=%ci HEAD)
              DATE_ONLY=$(echo $GIT_COMMIT_TIME | sed "s/ .*//;s/-//g")
              echo "Git Commit Time: $GIT_COMMIT_TIME"
              echo "Date Only: $DATE_ONLY"
              export FASTDEPLOY_VERSION="${FASTDEPLOY_VERSION}.dev${DATE_ONLY}"
            fi
            pip config set global.index-url http://pip.baidu.com/root/baidu/+simple/
            pip config set install.trusted-host pip.baidu.com
            pip config set global.extra-index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
            python -m pip install --upgrade pip
            python -m pip install -r requirements.txt
            python -m pip install wheel
            python -m pip install --pre paddlepaddle-gpu -i https://www.paddlepaddle.org.cn/packages/nightly/cu126/
            # Build RDMA support
            export ENABLE_FD_RDMA=1
            bash build.sh 1 python false [${COMPILE_ARCH}]
            ls ./dist/*.whl
          '
      - name: Package Upload
        id: set_output
        env:
          compile_arch: ${{ inputs.COMPILE_ARCH }}
        run: |
          set -x
          if [[ "${{ github.event_name }}" == "pull_request" ]];then
            commit_id=${{ github.event.pull_request.head.sha }}
            pr_num=${{ github.event.pull_request.number }}
            target_path=paddle-github-action/PR/FastDeploy/${pr_num}/${commit_id}/SM${compile_arch//,/_}
          elif [[ "${{ github.ref_type }}" == "tag" ]]; then
            commit_id=${{ github.sha }}
            tag_name=${{ github.ref_name }}
            target_path=paddle-github-action/TAG/FastDeploy/${tag_name}/${commit_id}/SM${compile_arch//,/_}
          else
            commit_id=${{ github.sha }}
            branch_name=${{ github.ref_name }}
            target_path=paddle-github-action/BRANCH/FastDeploy/${branch_name}/${commit_id}/SM${compile_arch//,/_}
          fi
          wget -q --no-proxy --no-check-certificate https://paddle-qa.bj.bcebos.com/CodeSync/develop/PaddlePaddle/PaddleTest/tools/bos_tools.py
          push_file=$(realpath bos_tools.py)
          python --version
          python -m pip install bce-python-sdk==0.9.29
          cd FastDeploy/dist/
          matches=($(ls fastdeploy*.whl))
          if [ ${#matches[@]} -ne 1 ]; then
            echo "Error: Found ${#matches[@]} matching files, expected exactly 1"
            exit 1
          fi
          fd_wheel_name=${matches[0]}
          echo "Found: $fd_wheel_name"
          tree -L 3
          python ${push_file} fastdeploy*.whl ${target_path}
          target_path_stripped="${target_path#paddle-github-action/}"
          WHEEL_PATH=https://paddle-github-action.bj.bcebos.com/${target_path_stripped}/${fd_wheel_name}
          echo "wheel_path=${WHEEL_PATH}" >> $GITHUB_OUTPUT

.github/workflows/_clone_linux.yml (new file, +78)

@@ -0,0 +1,78 @@
name: FastDeploy Code Clone
description: "FastDeploy clone and upload"

on:
  workflow_call:
    inputs:
      bos_dir:
        type: string
        required: false
        default: 'FastDeploy'
    outputs:
      repo_archive_url:
        description: "Compressed source code archive."
        value: ${{ jobs.code-clone.outputs.repo_archive_url }}

jobs:
  code-clone:
    runs-on:
      group: HK-Clone
    outputs:
      repo_archive_url: ${{ steps.set_output.outputs.repo_archive_url }}
    steps:
      - name: Clone FastDeploy
        uses: actions/checkout@v4
        with:
          ref: ${{ github.event_name == 'pull_request'
            && github.event.pull_request.base.ref
            || github.ref_name }}
          submodules: 'recursive'
          fetch-depth: 1000
      - name: Merge PR (if needed)
        if: ${{ github.event_name == 'pull_request' }}
        run: |
          git config --global user.name "FastDeployCI"
          git config --global user.email "fastdeploy_ci@example.com"
          echo "Fetching and merging PR..."
          git fetch origin pull/${{ github.event.pull_request.number }}/head:pr/${{ github.event.pull_request.number }}
          git merge --no-ff pr/${{ github.event.pull_request.number }}
          echo "PR Branch log "
          git log --oneline -n 5 pr/${{ github.event.pull_request.number }}
      - uses: actions/setup-python@v5
        with:
          python-version: '3.10'
      - name: Code Info Show and Upload
        id: set_output
        env:
          AK: paddle
          SK: paddle
        run: |
          git config --unset http.https://github.com/.extraheader
          git submodule foreach --recursive sh -c "git config --local --unset-all 'http.https://github.com/.extraheader'"
          git submodule foreach --recursive sh -c "git config remote.origin.fetch '+refs/heads/*:refs/remotes/origin/*'"
          echo "Current HEAD Log:"
          git log --oneline -n 5
          ls
          cd ..
          tar -zcf FastDeploy.tar.gz FastDeploy
          if [[ "${{ github.event_name }}" == "pull_request" ]];then
            commit_id=${{ github.event.pull_request.head.sha }}
            pr_num=${{ github.event.pull_request.number }}
            target_path=paddle-github-action/PR/FastDeploy/${pr_num}/${commit_id}
          elif [[ "${{ github.ref_type }}" == "tag" ]]; then
            commit_id=${{ github.sha }}
            tag_name=${{ github.ref_name }}
            target_path=paddle-github-action/TAG/FastDeploy/${tag_name}/${commit_id}
          else
            commit_id=${{ github.sha }}
            branch_name=${{ github.ref_name }}
            target_path=paddle-github-action/BRANCH/FastDeploy/${branch_name}/${commit_id}
          fi
          wget -q --no-proxy --no-check-certificate https://paddle-qa.bj.bcebos.com/CodeSync/develop/PaddlePaddle/PaddleTest/tools/bos_tools.py
          push_file=$(realpath bos_tools.py)
          python -m pip install bce-python-sdk==0.9.29
          ls
          python ${push_file} FastDeploy.tar.gz ${target_path}
          target_path_stripped="${target_path#paddle-github-action/}"
          REPO_ARCHIVE_URL=https://paddle-github-action.bj.bcebos.com/${target_path_stripped}/FastDeploy.tar.gz
          echo "repo_archive_url=${REPO_ARCHIVE_URL}" >> $GITHUB_OUTPUT

CI workflow (modified)

@@ -2,7 +2,9 @@ name: CI
on:
pull_request:
branches: [ develop ]
branches:
- develop
- 'release/*'
workflow_dispatch:
concurrency:
@@ -27,9 +29,11 @@ jobs:
REPO="https://github.com/${{ github.repository }}.git"
FULL_REPO="${{ github.repository }}"
REPO_NAME="${FULL_REPO##*/}"
BASE_BRANCH="${{ github.base_ref }}"
# Clean the repository directory before starting
docker run --rm --net=host -v $(pwd):/workspace -w /workspace \
-e "REPO_NAME=${REPO_NAME}" \
-e "BASE_BRANCH=${BASE_BRANCH}" \
${docker_image} /bin/bash -c '
if [ -d ${REPO_NAME} ]; then
echo "Directory ${REPO_NAME} exists, removing it..."
@@ -38,7 +42,7 @@ jobs:
'
git config --global user.name "FastDeployCI"
git config --global user.email "fastdeploy_ci@example.com"
git clone ${REPO} ${REPO_NAME}
git clone ${REPO} ${REPO_NAME} -b ${BASE_BRANCH}
cd FastDeploy
if [ "${{ github.event_name }}" = "pull_request" ]; then
git fetch origin pull/${{ github.event.pull_request.number }}/head:pr/${{ github.event.pull_request.number }}
@@ -56,14 +60,36 @@ jobs:
runner_name="${{ runner.name }}"
last_char="${runner_name: -1}"
if [[ "$last_char" =~ [0-3] ]]; then
gpu_id="$last_char"
if [ "${last_char}" = "1" ]; then
gpu_id=2
DEVICES="2,3"
else
gpu_id="0"
gpu_id=0
DEVICES="0,1"
fi
FD_API_PORT=$((9180 + gpu_id * 100))
FD_ENGINE_QUEUE_PORT=$((9150 + gpu_id * 100))
FD_METRICS_PORT=$((9170 + gpu_id * 100))
FLASK_PORT=$((41068 + gpu_id * 100))
FD_API_PORT=$((41088 + gpu_id * 100))
FD_ENGINE_QUEUE_PORT=$((41058 + gpu_id * 100))
FD_METRICS_PORT=$((41078 + gpu_id * 100))
PORTS=($FLASK_PORT $FD_API_PORT $FD_ENGINE_QUEUE_PORT $FD_METRICS_PORT)
LOG_FILE="./port_cleanup_$(date +%Y%m%d_%H%M%S).log"
echo "==== LOG_FILE is ${LOG_FILE} ===="
echo "==== PORT CLEAN BEFORE TASK RUN ====" | tee -a $LOG_FILE
for port in "${PORTS[@]}"; do
PIDS=$(lsof -t -i :$port || true)
if [ -n "$PIDS" ]; then
echo "Port $port is occupied by PID(s): $PIDS" | tee -a $LOG_FILE
echo "$PIDS" | xargs -r kill -9
echo "Port $port cleared" | tee -a $LOG_FILE
else
echo "Port $port is free" | tee -a $LOG_FILE
fi
done
echo "==== PORT CLEAN COMPLETE ====" | tee -a $LOG_FILE
PARENT_DIR=$(dirname "$WORKSPACE")
echo "PARENT_DIR:$PARENT_DIR"
@@ -76,8 +102,8 @@ jobs:
-e "FD_API_PORT=${FD_API_PORT}" \
-e "FD_ENGINE_QUEUE_PORT=${FD_ENGINE_QUEUE_PORT}" \
-e "FD_METRICS_PORT=${FD_METRICS_PORT}" \
--gpus device=${gpu_id} ${docker_image} /bin/bash -c "
--gpus '"device='"${DEVICES}"'"' ${docker_image} /bin/bash -c "
git config --global --add safe.directory /workspace/FastDeploy
cd FastDeploy
bash scripts/run_ci.sh
"
"

.github/workflows/ci_iluvatar.yml (new file, +84)

@@ -0,0 +1,84 @@
name: CI_ILUVATAR

on:
  pull_request:
    branches: [ develop ]
  workflow_dispatch:

concurrency:
  group: ${{ github.event.pull_request.number }}-iluvatar-ci
  cancel-in-progress: true

jobs:
  CI_ILUVATAR:
    runs-on: [self-hosted, IXUCA]
    steps:
      - name: Print current runner name
        run: |
          echo "Current runner name: ${{ runner.name }}"
      # Because the system version is lower than 2.23, the checkout cannot be used.
      # - name: Checkout code
      #   uses: actions/checkout@v4
      - name: Code Checkout
        env:
          docker_image: ccr-2vdh3abv-pub.cnc.bj.baidubce.com/device/paddle-ixuca:latest
        run: |
          REPO="https://github.com/${{ github.repository }}.git"
          FULL_REPO="${{ github.repository }}"
          REPO_NAME="${FULL_REPO##*/}"
          # Clean the repository directory before starting
          docker run --rm --net=host -v $(pwd):/workspace -w /workspace \
            -e "REPO_NAME=${REPO_NAME}" \
            ${docker_image} /bin/bash -c '
            if [ -d ${REPO_NAME} ]; then
              echo "Directory ${REPO_NAME} exists, removing it..."
              rm -rf ${REPO_NAME}
            fi
          '
          git config --global user.name "FastDeployCI"
          git config --global user.email "fastdeploy_ci@example.com"
          git clone ${REPO} ${REPO_NAME}
          cd FastDeploy
          if [ "${{ github.event_name }}" = "pull_request" ]; then
            git fetch origin pull/${{ github.event.pull_request.number }}/head:pr/${{ github.event.pull_request.number }}
            git merge pr/${{ github.event.pull_request.number }}
            git log -n 3 --oneline
          else
            git checkout ${{ github.sha }}
            git log -n 3 --oneline
          fi
      - name: Run CI unittest
        env:
          docker_image: ccr-2vdh3abv-pub.cnc.bj.baidubce.com/device/paddle-ixuca:latest
        run: |
          runner_name="${{ runner.name }}"
          last_char="${runner_name: -1}"
          if [[ "$last_char" =~ [0-3] ]]; then
            gpu_id="$last_char"
          else
            gpu_id="0"
          fi
          FD_API_PORT=$((9180 + gpu_id * 100))
          FD_ENGINE_QUEUE_PORT=$((9150 + gpu_id * 100))
          FD_METRICS_PORT=$((9170 + gpu_id * 100))
          PARENT_DIR=$(dirname "$WORKSPACE")
          echo "PARENT_DIR:$PARENT_DIR"
          docker run --rm --net=host --pid=host --cap-add=ALL --privileged --shm-size=64G \
            -v /usr/src:/usr/src -v /lib/modules:/lib/modules -v /dev:/dev \
            -v $(pwd):/workspace -w /workspace \
            -v "/data1/fastdeploy:/data1/fastdeploy" \
            -e "MODEL_PATH=/ssd3/model" \
            -e "http_proxy=$(git config --global --get http.proxy)" \
            -e "https_proxy=$(git config --global --get https.proxy)" \
            -e "FD_API_PORT=${FD_API_PORT}" \
            -e "FD_ENGINE_QUEUE_PORT=${FD_ENGINE_QUEUE_PORT}" \
            -e "FD_METRICS_PORT=${FD_METRICS_PORT}" \
            ${docker_image} /bin/bash -c "
            git config --global --add safe.directory /workspace/FastDeploy
            cd FastDeploy
            bash scripts/run_ci_iluvatar.sh
          "

CI_XPU workflow (modified)

@@ -2,7 +2,9 @@ name: CI_XPU
on:
pull_request:
branches: [ develop ]
branches:
- develop
- 'release/*'
workflow_dispatch:
concurrency:
@@ -10,7 +12,7 @@ concurrency:
cancel-in-progress: true
jobs:
build:
CI_XPU:
runs-on: [self-hosted, XPU-P800-8Card]
steps:
- name: Print current runner name
@@ -27,9 +29,11 @@ jobs:
REPO="https://github.com/${{ github.repository }}.git"
FULL_REPO="${{ github.repository }}"
REPO_NAME="${FULL_REPO##*/}"
BASE_BRANCH="${{ github.base_ref }}"
# Clean the repository directory before starting
docker run --rm --net=host -v $(pwd):/workspace -w /workspace \
-e "REPO_NAME=${REPO_NAME}" \
-e "BASE_BRANCH=${BASE_BRANCH}" \
${docker_image} /bin/bash -c '
if [ -d ${REPO_NAME} ]; then
echo "Directory ${REPO_NAME} exists, removing it..."
@@ -38,7 +42,7 @@ jobs:
'
git config --global user.name "FastDeployCI"
git config --global user.email "fastdeploy_ci@example.com"
git clone ${REPO} ${REPO_NAME}
git clone ${REPO} ${REPO_NAME} -b ${BASE_BRANCH}
cd FastDeploy
if [ "${{ github.event_name }}" = "pull_request" ]; then
git fetch origin pull/${{ github.event.pull_request.number }}/head:pr/${{ github.event.pull_request.number }}
@@ -59,7 +63,7 @@ jobs:
if [[ "$last_char" =~ [0-3] ]]; then
gpu_id="$last_char"
else
gpu_id="0"
gpu_id="0"
fi
FD_API_PORT=$((9180 + gpu_id * 100))
FD_ENGINE_QUEUE_PORT=$((9150 + gpu_id * 100))
@@ -80,4 +84,4 @@ jobs:
git config --global --add safe.directory /workspace/FastDeploy
cd FastDeploy
bash scripts/run_ci_xpu.sh
"
"

.github/workflows/pr_build_and_test.yml (new file, +35)

@@ -0,0 +1,35 @@
name: PR Build and Test

on:
  pull_request:
    types: [opened, synchronize]
    branches: [develop, release/**]

permissions: read-all

concurrency:
  group: ${{ github.event.pull_request.number }}-${{ github.workflow }}
  cancel-in-progress: true

jobs:
  clone:
    name: FD-Clone-Linux
    uses: ./.github/workflows/_clone_linux.yml

  build:
    name: FD-Build-Linux
    needs: clone
    uses: ./.github/workflows/_build_linux.yml
    with:
      DOCKER_IMAGE: ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/paddleqa:cuda126-py310
      FASTDEPLOY_ARCHIVE_URL: ${{ needs.clone.outputs.repo_archive_url }}
      COMPILE_ARCH: "90"
      WITH_NIGHTLY_BUILD: "OFF"
      FD_VERSION: "0.0.0"

  resultshow:
    name: Use Build Output
    needs: build
    runs-on: ubuntu-latest
    steps:
      - name: Print wheel path
        run: |
          echo "The built wheel is located at: ${{ needs.build.outputs.wheel_path }}"

.gitignore (modified, +2)

@@ -162,3 +162,5 @@ custom_ops/tmp*
build
.ccls-cache
third_party

.pre-commit-config.yaml (modified)

@@ -3,14 +3,30 @@ default_install_hook_types:
- commit-msg
default_stages:
- pre-commit # Run locally
- commit-msg
# - manual # Run in CI
repos:
- repo: https://github.com/psf/black.git
rev: 25.1.0
hooks:
- id: black
files: \.(py|pyi)$
additional_dependencies: [toml]
# auto-sort imports
- repo: https://github.com/PyCQA/isort
rev: 5.11.5
hooks:
- id: isort
- repo: https://github.com/PyCQA/flake8
rev: 7.0.0
hooks:
- id: flake8
# code lint
- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.11.7
hooks:
- id: ruff
args: [--output-format, github, --fix, --line-length=120]
args: [--output-format, github, --fix, --line-length=120, --config, pyproject.toml]
# # spell check
# - repo: https://github.com/codespell-project/codespell
# rev: v2.4.1
@@ -18,17 +34,13 @@ repos:
# - id: codespell
# additional_dependencies: ['tomli']
# args: ['--toml', 'pyproject.toml']
# auto-sort imports
- repo: https://github.com/PyCQA/isort
rev: 6.0.1
hooks:
- id: isort
# markdown
- repo: https://github.com/jackdewinter/pymarkdown
rev: v0.9.29
hooks:
- id: pymarkdown
args: [fix]
args: ["-d", "MD029,MD031", fix]
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v5.0.0
hooks:

README (modified)

@@ -8,14 +8,17 @@
<a href="https://github.com/PaddlePaddle/FastDeploy/commits"><img src="https://img.shields.io/github/commit-activity/m/PaddlePaddle/FastDeploy?color=3af"></a>
<a href="https://github.com/PaddlePaddle/FastDeploy/issues"><img src="https://img.shields.io/github/issues/PaddlePaddle/FastDeploy?color=9cc"></a>
<a href="https://github.com/PaddlePaddle/FastDeploy/stargazers"><img src="https://img.shields.io/github/stars/PaddlePaddle/FastDeploy?color=ccf"></a>
</p>
<p align="center">
<a href="https://trendshift.io/repositories/4046" target="_blank"><img src="https://trendshift.io/api/badge/repositories/4046" alt="PaddlePaddle%2FFastDeploy | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/></a></br>
<a href="https://paddlepaddle.github.io/FastDeploy/get_started/installation/nvidia_gpu/"><b> Installation </b></a>
|
<a href="https://paddlepaddle.github.io/FastDeploy/get_started/quick_start"><b> Quick Start </b></a>
|
<a href="https://paddlepaddle.github.io/FastDeploy/supported_models/"><b> Supported Models </b></a>
</p>
--------------------------------------------------------------------------------
@@ -23,6 +26,10 @@
## News
**[2025-07] The FastDeploy 2.0 Inference Deployment Challenge is now live!** Complete the inference deployment task for the ERNIE 4.5 series open-source models to win official FastDeploy 2.0 merch and generous prizes! 🎁 You're welcome to try it out and share your feedback! 📌[Sign up here](https://www.wjx.top/vm/meSsp3L.aspx#) 📌[Event details](https://github.com/PaddlePaddle/FastDeploy/discussions/2728)
**[2025-06] 🔥 Released FastDeploy v2.0:** Supports inference and deployment for ERNIE 4.5. Furthermore, we open-source an industrial-grade PD disaggregation solution with context caching and dynamic role switching for effective resource utilization, further enhancing inference performance for MoE models.
## About


@@ -41,7 +41,10 @@ python -m pip install -r requirements.txt
--metric-percentiles 80,95,99,99.9,99.95,99.99: percentile values reported for the performance metrics
--num-prompts 1: total number of requests to send
--max-concurrency 1: concurrency level for the load test
--save-result: enable result saving; results are written to a JSON file (default False, not saved)
--debug: enable debug mode, printing each request's payload and output (default False)
--shuffle: whether to shuffle the dataset (default False, no shuffling)
--seed: random seed used when shuffling the dataset (default 0)
```
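For example, the newly added options can be combined into a single run; the following invocation is hypothetical (host, port, and dataset path are placeholders):

```bash
python benchmark_serving.py \
    --backend openai-chat \
    --host 127.0.0.1 --port 8000 \
    --dataset-name EBChat \
    --dataset-path ./dataset.json \
    --num-prompts 100 --max-concurrency 8 \
    --shuffle --seed 42 \
    --debug \
    --save-result
```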
##### Load testing the /v1/chat/completions endpoint (single-request debugging)
@@ -105,3 +108,30 @@ python benchmark_serving.py \
--save-result > infer_log.txt 2>&1 &
```
### Speculative decoding benchmark tool
#### Usage:
```bash
python benchmarks/benchmark_mtp.py \
--host 127.0.0.1 --port 8000 \
--max-concurrency 16 32 64 96 --num-prompts 256 \
--acceptance-rate 0.8 --draft-token-steps 1 2 3 \
--s_itl-base-model 15.88 22.84 16.47 16.93 \
--dataset-name EBChat \
--dataset-path ./filtered_sharedgpt_2000_input_1136_output_200_fd.json
```
#### Parameter description
```bash
--host: server IP address, used to build the request URL
--port: HTTP port of the service, used to build the request URL
--max-concurrency: concurrency levels to test
--num-prompts: total number of requests to send
--acceptance-rate: simulated acceptance rate for speculative decoding
--draft-token-steps: number of draft-token steps for speculative decoding
--s_itl-base-model: decode latency of the base model, obtainable with the load-test tool above, one value per batch size
--dataset-name: dataset class to use; set to "EBChat" to read a re-exported FD-format dataset
--dataset-path: path to the test dataset
```
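For reference, the reported speedup follows the `calculate_speedup` helper in `benchmarks/benchmark_mtp.py` (shown further below): with acceptance rate `a` and `k` draft-token steps, the expected share of tokens coming from accepted drafts is `r_ac = (a + a^2 + ... + a^k) / (1 + a + ... + a^k)`, and the MTP inter-token latency is scaled by `(1 - r_ac)` before comparing to the base model. A minimal sketch with illustrative numbers (the `t_mtp` value here is made up for the example):

```python
# Estimated MTP speedup, mirroring calculate_speedup in benchmarks/benchmark_mtp.py.
a, k = 0.8, 3                # --acceptance-rate and one of --draft-token-steps
t_ori, t_mtp = 15.88, 9.5    # base-model ITL (from --s_itl-base-model) and an illustrative MTP ITL, in ms

s = sum(a**i for i in range(1, k + 1))   # a + a^2 + ... + a^k
r_ac = s / (1 + s)                       # expected fraction of tokens from accepted drafts
print(f"estimated speedup: {t_ori / ((1 - r_ac) * t_mtp):.2f}x")  # ~4.93x
```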


@@ -29,13 +29,14 @@ from typing import Optional
import aiohttp
from tqdm.asyncio import tqdm
AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=6 * 60 * 60)
@dataclass
class RequestFuncInput:
"""Input for requesting LLMs via API"""
no: int
prompt: str
history_QA: Optional[dict]
hyper_parameters: dict
@@ -49,11 +50,14 @@ class RequestFuncInput:
multi_modal_content: Optional[dict] = None
ignore_eos: bool = False
language: Optional[str] = None
debug: bool = False
@dataclass
class RequestFuncOutput:
"""Output for requesting LLMs via API"""
no: int = 0
generated_text: str = ""
reasoning_content: str = ""
success: bool = False
@@ -64,7 +68,7 @@ class RequestFuncOutput:
itl: list = field(default_factory=list) # list of inter-token latencies
tpot: float = 0.0 # avg next-token latencies
prompt_len: int = 0
prompt_tokens: int = 0  # number of input tokens returned by the inference side
error: str = ""
@@ -74,22 +78,19 @@ async def async_request_eb_openai_chat_completions(
) -> RequestFuncOutput:
"""Request an LLM using EB OpenAI"""
api_url = request_func_input.api_url
assert api_url.endswith(
("completions", "profile")
), "OpenAI Chat Completions API URL must end with 'completions'."
assert api_url.endswith(("completions", "profile")), "OpenAI Chat Completions API URL must end with 'completions'."
async with aiohttp.ClientSession(trust_env=True,
timeout=AIOHTTP_TIMEOUT) as session:
async with aiohttp.ClientSession(trust_env=True, timeout=AIOHTTP_TIMEOUT) as session:
content = [{"type": "text", "text": request_func_input.prompt}]
if request_func_input.multi_modal_content:
content.append(request_func_input.multi_modal_content)
payload = {
"model": "default",
"model": request_func_input.model,
"messages": request_func_input.history_QA,
"stream": True,
"stream_options": {
"include_usage": True,
"continuous_usage_stats": True
"continuous_usage_stats": True,
},
}
# hyperparameters are passed in via YAML
@@ -97,6 +98,10 @@ async def async_request_eb_openai_chat_completions(
if request_func_input.ignore_eos:
payload["ignore_eos"] = request_func_input.ignore_eos
if request_func_input.debug:
print(f"payload:{json.dumps(payload, ensure_ascii=False)}")
headers = {
"Content-Type": "application/json",
"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}",
@@ -104,21 +109,20 @@ async def async_request_eb_openai_chat_completions(
output = RequestFuncOutput()
output.prompt_len = 0
output.no = request_func_input.no
ttft = 0.0
st = time.perf_counter()
most_recent_timestamp = st
try:
async with session.post(url=api_url, json=payload,
headers=headers) as response:
async with session.post(url=api_url, json=payload, headers=headers) as response:
if response.status == 200:
async for chunk_bytes in response.content:
chunk_bytes = chunk_bytes.strip()
if not chunk_bytes:
continue
chunk = chunk_bytes.decode("utf-8").removeprefix(
"data: ")
chunk = chunk_bytes.decode("utf-8").removeprefix("data: ")
if chunk != "[DONE]":
# print("####chunk:", chunk, type(chunk))
timestamp = time.perf_counter()
@@ -132,21 +136,20 @@ async def async_request_eb_openai_chat_completions(
ttft = timestamp - st
output.ttft = ttft
# cached_tokens
output.prompt_len = data["usage"]["prompt_tokens_details"]["cached_tokens"]
output.prompt_len = (
data["usage"].get("prompt_tokens_details", {}).get("cached_tokens", 0)
)
# Decoding phase
else:
output.itl.append(timestamp -
most_recent_timestamp)
output.itl.append(timestamp - most_recent_timestamp)
output.generated_text += content or ""
output.reasoning_content += reason_content or ""
output.arrival_time.append(choices[0].get("arrival_time"))
elif usage := data.get("usage"):
output.output_tokens = usage.get(
"completion_tokens")
output.prompt_tokens = usage.get(
"prompt_tokens")
output.arrival_time.append(choices[0].get("arrival_time", timestamp))
elif usage := data.get("usage", {}):
output.output_tokens = usage.get("completion_tokens", 0)
output.prompt_tokens = usage.get("prompt_tokens", 0)
most_recent_timestamp = timestamp
@@ -159,7 +162,12 @@ async def async_request_eb_openai_chat_completions(
output.latency = most_recent_timestamp - st
else:
error_text = await response.text()
print("####error response:", error_text, "####payload:", payload)
print(
"####error response:",
error_text,
"####payload:",
payload,
)
output.error = error_text or ""
output.success = False
except Exception:
@@ -173,6 +181,8 @@ async def async_request_eb_openai_chat_completions(
f.write(str(output) + "\n")
if pbar:
pbar.update(1)
if request_func_input.debug:
print("#####final_output:", output)
return output
@@ -186,15 +196,14 @@ async def async_request_eb_openai_completions(
("completions", "profile")
), "OpenAI Completions API URL must end with 'completions' or 'profile'."
async with aiohttp.ClientSession(trust_env=True,
timeout=AIOHTTP_TIMEOUT) as session:
async with aiohttp.ClientSession(trust_env=True, timeout=AIOHTTP_TIMEOUT) as session:
payload = {
"model": "default",
"model": request_func_input.model,
"prompt": request_func_input.prompt,
"stream": True,
"stream_options": {
"include_usage": True,
"continuous_usage_stats": True
"continuous_usage_stats": True,
},
}
# hyperparameters are passed in via YAML
@@ -202,19 +211,25 @@ async def async_request_eb_openai_completions(
if request_func_input.ignore_eos:
payload["ignore_eos"] = request_func_input.ignore_eos
if request_func_input.debug:
print("payload:", json.dumps(payload, ensure_ascii=False))
headers = {
"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"
"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}",
"Content-Type": "application/json",
}
output = RequestFuncOutput()
output.prompt_len = request_func_input.prompt_len
output.no = request_func_input.no
generated_text = ""
ttft = 0.0
st = time.perf_counter()
most_recent_timestamp = st
try:
async with session.post(url=api_url, json=payload,
headers=headers) as response:
async with session.post(url=api_url, json=payload, headers=headers) as response:
if response.status == 200:
first_chunk_received = False
async for chunk_bytes in response.content:
@@ -222,10 +237,10 @@ async def async_request_eb_openai_completions(
if not chunk_bytes:
continue
chunk = chunk_bytes.decode("utf-8").removeprefix(
"data: ")
chunk = chunk_bytes.decode("utf-8").removeprefix("data: ")
if chunk != "[DONE]":
# print("####chunk:", chunk, chunk.usage)
timestamp = time.perf_counter()
data = json.loads(chunk)
# NOTE: Some completion API might have a last
@@ -235,35 +250,40 @@ async def async_request_eb_openai_completions(
# Note that text could be empty here
# e.g. for special tokens
text = choices[0].get("text")
timestamp = time.perf_counter()
# First token
if not first_chunk_received:
first_chunk_received = True
ttft = time.perf_counter() - st
ttft = timestamp - st
output.ttft = ttft
# Decoding phase
else:
output.itl.append(timestamp -
most_recent_timestamp)
output.itl.append(timestamp - most_recent_timestamp)
generated_text += text or ""
most_recent_timestamp = timestamp
output.arrival_time.append(choices[0].get("arrival_time"))
generated_text += text or ""
output.arrival_time.append(choices[0].get("arrival_time", timestamp))
elif usage := data.get("usage"):
output.prompt_tokens = usage.get(
"prompt_tokens")
output.output_tokens = usage.get(
"completion_tokens")
output.prompt_tokens = usage.get("prompt_tokens")
output.output_tokens = usage.get("completion_tokens")
if first_chunk_received:
output.success = True
else:
output.success = False
output.error = (
"Never received a valid chunk to calculate TTFT."
"This response will be marked as failed!")
"Never received a valid chunk to calculate TTFT." "This response will be marked as failed!"
)
output.generated_text = generated_text
output.latency = most_recent_timestamp - st
if output.generated_text == "":
output.success = False
output.error = "No generated text found!"
else:
output.success = True
else:
output.error = response.reason or ""
output.success = False
@@ -272,6 +292,9 @@ async def async_request_eb_openai_completions(
exc_info = sys.exc_info()
output.error = "".join(traceback.format_exception(*exc_info))
if request_func_input.debug:
print(f"final_output:{output}")
if pbar:
pbar.update(1)
return output
@@ -285,8 +308,7 @@ async def async_request_tgi(
api_url = request_func_input.api_url
assert api_url.endswith("generate_stream")
async with aiohttp.ClientSession(trust_env=True,
timeout=AIOHTTP_TIMEOUT) as session:
async with aiohttp.ClientSession(trust_env=True, timeout=AIOHTTP_TIMEOUT) as session:
params = {
"max_new_tokens": request_func_input.output_len,
"do_sample": True,
@@ -333,8 +355,7 @@ async def async_request_tgi(
# Decoding phase
else:
output.itl.append(timestamp -
most_recent_timestamp)
output.itl.append(timestamp - most_recent_timestamp)
most_recent_timestamp = timestamp
output.arrival_time.append(data["arrival_time"])
@@ -363,8 +384,7 @@ async def async_request_trt_llm(
api_url = request_func_input.api_url
assert api_url.endswith("generate_stream")
async with aiohttp.ClientSession(trust_env=True,
timeout=AIOHTTP_TIMEOUT) as session:
async with aiohttp.ClientSession(trust_env=True, timeout=AIOHTTP_TIMEOUT) as session:
payload = {
"accumulate_tokens": True,
"text_input": request_func_input.prompt,
@@ -389,8 +409,7 @@ async def async_request_trt_llm(
if not chunk_bytes:
continue
chunk = chunk_bytes.decode("utf-8").removeprefix(
"data:")
chunk = chunk_bytes.decode("utf-8").removeprefix("data:")
data = json.loads(chunk)
output.generated_text += data["text_output"]
@@ -402,8 +421,7 @@ async def async_request_trt_llm(
# Decoding phase
else:
output.itl.append(timestamp -
most_recent_timestamp)
output.itl.append(timestamp - most_recent_timestamp)
most_recent_timestamp = timestamp
@@ -428,8 +446,7 @@ async def async_request_deepspeed_mii(
pbar: Optional[tqdm] = None,
) -> RequestFuncOutput:
"""Request an LLM using Deepspeed MII"""
async with aiohttp.ClientSession(trust_env=True,
timeout=AIOHTTP_TIMEOUT) as session:
async with aiohttp.ClientSession(trust_env=True, timeout=AIOHTTP_TIMEOUT) as session:
payload = {
"prompt": request_func_input.prompt,
@@ -447,19 +464,16 @@ async def async_request_deepspeed_mii(
st = time.perf_counter()
try:
async with session.post(url=request_func_input.api_url,
json=payload) as response:
async with session.post(url=request_func_input.api_url, json=payload) as response:
if response.status == 200:
parsed_resp = await response.json()
output.latency = time.perf_counter() - st
if "choices" in parsed_resp:
output.generated_text = parsed_resp["choices"][0][
"text"]
output.generated_text = parsed_resp["choices"][0]["text"]
elif "text" in parsed_resp:
output.generated_text = parsed_resp["text"][0]
else:
output.error = ("Unexpected response format: "
"neither 'choices' nor 'text' found")
output.error = "Unexpected response format: " "neither 'choices' nor 'text' found"
output.success = False
output.success = True
else:
@@ -485,26 +499,22 @@ async def async_request_openai_completions(
("completions", "profile")
), "OpenAI Completions API URL must end with 'completions' or 'profile'."
async with aiohttp.ClientSession(trust_env=True,
timeout=AIOHTTP_TIMEOUT) as session:
async with aiohttp.ClientSession(trust_env=True, timeout=AIOHTTP_TIMEOUT) as session:
payload = {
"model": request_func_input.model_name \
if request_func_input.model_name else request_func_input.model,
"model": (request_func_input.model_name if request_func_input.model_name else request_func_input.model),
"prompt": request_func_input.prompt,
# "temperature": 0.0,
"max_tokens": request_func_input.output_len,
"logprobs": request_func_input.logprobs,
"stream": True,
#"stream_options": {
# "stream_options": {
# "include_usage": True,
#},
# },
}
if request_func_input.ignore_eos:
payload["ignore_eos"] = request_func_input.ignore_eos
headers = {
"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"
}
headers = {"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"}
output = RequestFuncOutput()
output.prompt_len = request_func_input.prompt_len
@@ -513,8 +523,7 @@ async def async_request_openai_completions(
st = time.perf_counter()
most_recent_timestamp = st
try:
async with session.post(url=api_url, json=payload,
headers=headers) as response:
async with session.post(url=api_url, json=payload, headers=headers) as response:
if response.status == 200:
first_chunk_received = False
async for chunk_bytes in response.content:
@@ -522,8 +531,7 @@ async def async_request_openai_completions(
if not chunk_bytes:
continue
chunk = chunk_bytes.decode("utf-8").removeprefix(
"data: ")
chunk = chunk_bytes.decode("utf-8").removeprefix("data: ")
if chunk != "[DONE]":
# print("####chunk:", chunk, type(chunk))
data = json.loads(chunk)
@@ -544,21 +552,19 @@ async def async_request_openai_completions(
# Decoding phase
else:
output.itl.append(timestamp -
most_recent_timestamp)
output.itl.append(timestamp - most_recent_timestamp)
most_recent_timestamp = timestamp
generated_text += text or ""
elif usage := data.get("usage"):
output.output_tokens = usage.get(
"completion_tokens")
output.output_tokens = usage.get("completion_tokens")
if first_chunk_received:
output.success = True
else:
output.success = False
output.error = (
"Never received a valid chunk to calculate TTFT."
"This response will be marked as failed!")
"Never received a valid chunk to calculate TTFT." "This response will be marked as failed!"
)
output.generated_text = generated_text
output.latency = most_recent_timestamp - st
else:
@@ -581,25 +587,24 @@ async def async_request_openai_audio(
"""Request an LLM using OpenAI"""
# Lazy import without PlaceholderModule to avoid vllm dep.
import soundfile
api_url = request_func_input.api_url
assert api_url.endswith(
("transcriptions", "translations"
)), "OpenAI Chat Completions API URL must end with 'transcriptions' "
("transcriptions", "translations")
), "OpenAI Chat Completions API URL must end with 'transcriptions' "
"or `translations`."
async with aiohttp.ClientSession(trust_env=True,
timeout=AIOHTTP_TIMEOUT) as session:
async with aiohttp.ClientSession(trust_env=True, timeout=AIOHTTP_TIMEOUT) as session:
content = [{"type": "text", "text": request_func_input.prompt}]
payload = {
"model": request_func_input.model_name \
if request_func_input.model_name else request_func_input.model,
"model": (request_func_input.model_name if request_func_input.model_name else request_func_input.model),
"temperature": 0.0,
"max_completion_tokens": request_func_input.output_len,
"stream": True,
"language": "en",
# Flattened due to multipart/form-data
"stream_include_usage": True,
"stream_continuous_usage_stats": True
"stream_continuous_usage_stats": True,
}
if request_func_input.extra_body:
payload.update(request_func_input.extra_body)
@@ -614,9 +619,9 @@ async def async_request_openai_audio(
buffer.seek(0)
return buffer
with to_bytes(*request_func_input.multi_modal_content['audio']) as f:
with to_bytes(*request_func_input.multi_modal_content["audio"]) as f:
form = aiohttp.FormData()
form.add_field('file', f, content_type='audio/wav')
form.add_field("file", f, content_type="audio/wav")
for key, value in payload.items():
form.add_field(key, str(value))
@@ -628,24 +633,20 @@ async def async_request_openai_audio(
st = time.perf_counter()
most_recent_timestamp = st
try:
async with session.post(url=api_url,
data=form,
headers=headers) as response:
async with session.post(url=api_url, data=form, headers=headers) as response:
if response.status == 200:
async for chunk_bytes in response.content:
chunk_bytes = chunk_bytes.strip()
if not chunk_bytes:
continue
chunk = chunk_bytes.decode("utf-8").removeprefix(
"data: ")
chunk = chunk_bytes.decode("utf-8").removeprefix("data: ")
if chunk != "[DONE]":
timestamp = time.perf_counter()
data = json.loads(chunk)
if choices := data.get("choices"):
content = choices[0]["delta"].get(
"content")
content = choices[0]["delta"].get("content")
# First token
if ttft == 0.0:
ttft = timestamp - st
@@ -653,13 +654,11 @@ async def async_request_openai_audio(
# Decoding phase
else:
output.itl.append(
timestamp - most_recent_timestamp)
output.itl.append(timestamp - most_recent_timestamp)
generated_text += content or ""
elif usage := data.get("usage"):
output.output_tokens = usage.get(
"completion_tokens")
output.output_tokens = usage.get("completion_tokens")
most_recent_timestamp = timestamp
@@ -693,8 +692,11 @@ ASYNC_REQUEST_FUNCS = {
}
OPENAI_COMPATIBLE_BACKENDS = [
k for k, v in ASYNC_REQUEST_FUNCS.items()
if v in (async_request_openai_completions,
async_request_eb_openai_chat_completions)
k
for k, v in ASYNC_REQUEST_FUNCS.items()
if v
in (
async_request_openai_completions,
async_request_eb_openai_chat_completions,
)
]


@@ -26,9 +26,9 @@ from abc import ABC, abstractmethod
from collections.abc import Mapping
from dataclasses import dataclass
from io import BytesIO
from typing import Any, Callable, Optional, Union
from PIL import Image
from typing import Any, Optional, Union
from PIL import Image
logger = logging.getLogger(__name__)
@@ -39,6 +39,7 @@ class SampleRequest:
Represents a single inference request for benchmarking.
"""
no: int
prompt: Union[str, Any]
history_QA: Union[str, Any]
json_data: Optional[dict]
@@ -48,6 +49,7 @@ class SampleRequest:
class BenchmarkDataset(ABC):
"""BenchmarkDataset"""
DEFAULT_SEED = 0
IS_MULTIMODAL = False
@@ -55,6 +57,7 @@ class BenchmarkDataset(ABC):
self,
dataset_path: Optional[str] = None,
random_seed: int = DEFAULT_SEED,
shuffle: bool = False,
hyperparameter_path: Optional[str] = None,
) -> None:
"""
@@ -68,9 +71,9 @@ class BenchmarkDataset(ABC):
self.dataset_path = dataset_path
# Set the random seed, ensuring that a None value is replaced with the
# default seed.
self.random_seed = (random_seed
if random_seed is not None else self.DEFAULT_SEED)
self.random_seed = random_seed if random_seed is not None else self.DEFAULT_SEED
self.data = None
self.shuffle = shuffle
self.hyperparameter_path = hyperparameter_path
self.hyperparameters = {}
@@ -85,8 +88,7 @@ class BenchmarkDataset(ABC):
NotImplementedError: If a subclass does not implement this method.
"""
# TODO (jenniferzhao): add support for downloading data
raise NotImplementedError(
"load_data must be implemented in subclasses.")
raise NotImplementedError("load_data must be implemented in subclasses.")
@abstractmethod
def sample(self, num_requests: int) -> list[SampleRequest]:
@@ -105,8 +107,7 @@ class BenchmarkDataset(ABC):
"""
raise NotImplementedError("sample must be implemented in subclasses.")
def maybe_oversample_requests(self, requests: list[SampleRequest],
num_requests: int) -> None:
def maybe_oversample_requests(self, requests: list[SampleRequest], num_requests: int) -> None:
"""
Oversamples the list of requests if its size is less than the desired
number.
@@ -117,11 +118,9 @@ class BenchmarkDataset(ABC):
"""
if len(requests) < num_requests:
random.seed(self.random_seed)
additional = random.choices(requests,
k=num_requests - len(requests))
additional = random.choices(requests, k=num_requests - len(requests))
requests.extend(additional)
logger.info("Oversampled requests to reach %d total samples.",
num_requests)
logger.info("Oversampled requests to reach %d total samples.", num_requests)
def is_valid_sequence(
@@ -141,14 +140,12 @@ def is_valid_sequence(
"""
# Check for invalid conditions
prompt_too_short = prompt_len < min_len
output_too_short = (not skip_min_output_len_check) and (output_len
< min_len)
output_too_short = (not skip_min_output_len_check) and (output_len < min_len)
prompt_too_long = prompt_len > max_prompt_len
combined_too_long = (prompt_len + output_len) > max_total_len
# Return True if none of the invalid conditions are met
return not (prompt_too_short or output_too_short or prompt_too_long
or combined_too_long)
return not (prompt_too_short or output_too_short or prompt_too_long or combined_too_long)
def process_image(image: Any) -> Mapping[str, Any]:
@@ -171,28 +168,25 @@ def process_image(image: Any) -> Mapping[str, Any]:
Raises:
ValueError: If the input is not a supported type.
"""
if isinstance(image, dict) and 'bytes' in image:
image = Image.open(BytesIO(image['bytes']))
if isinstance(image, dict) and "bytes" in image:
image = Image.open(BytesIO(image["bytes"]))
if isinstance(image, Image.Image):
image = image.convert("RGB")
with io.BytesIO() as image_data:
image.save(image_data, format="JPEG")
image_base64 = base64.b64encode(
image_data.getvalue()).decode("utf-8")
image_base64 = base64.b64encode(image_data.getvalue()).decode("utf-8")
return {
"type": "image_url",
"image_url": {
"url": f"data:image/jpeg;base64,{image_base64}"
},
"image_url": {"url": f"data:image/jpeg;base64,{image_base64}"},
}
if isinstance(image, str):
image_url = (image if image.startswith(
("http://", "file://")) else f"file://{image}")
image_url = image if image.startswith(("http://", "file://")) else f"file://{image}"
return {"type": "image_url", "image_url": {"url": image_url}}
raise ValueError(f"Invalid image input {image}. Must be a PIL.Image.Image"
" or str or dictionary with raw image bytes.")
raise ValueError(
f"Invalid image input {image}. Must be a PIL.Image.Image" " or str or dictionary with raw image bytes."
)
class EBDataset(BenchmarkDataset):
@@ -219,6 +213,10 @@ class EBDataset(BenchmarkDataset):
with open(self.dataset_path, encoding="utf-8") as f:
self.data = [json.loads(i.strip()) for i in f.readlines()]
if self.shuffle:
random.seed(self.random_seed)
random.shuffle(self.data)
def sample(
self,
num_requests: int,
@@ -229,6 +227,7 @@ class EBDataset(BenchmarkDataset):
**kwargs,
) -> list:
samples: list = []
cnt = 1
for entry in self.data:
if len(samples) >= num_requests:
break
@@ -242,15 +241,17 @@ class EBDataset(BenchmarkDataset):
new_output_len = int(entry["max_dec_len"])
if enable_multimodal_chat:
prompt = self.apply_multimodal_chat_transformation(
prompt, None)
prompt = self.apply_multimodal_chat_transformation(prompt, None)
samples.append(
SampleRequest(
no=cnt,
prompt=prompt,
prompt_len=self.prompt_len,
history_QA=[],
expected_output_len=new_output_len,
))
)
)
cnt += 1
self.maybe_oversample_requests(samples, num_requests)
return samples
@@ -261,6 +262,7 @@ class EBChatDataset(BenchmarkDataset):
Implements the ShareGPT dataset. Loads data from a JSON file and generates
sample requests based on conversation turns.
"""
prompt_len: int
def __init__(self, **kwargs) -> None:
@@ -274,6 +276,10 @@ class EBChatDataset(BenchmarkDataset):
with open(self.dataset_path, encoding="utf-8") as f:
self.data = [json.loads(i.strip()) for i in f.readlines()]
if self.shuffle:
random.seed(self.random_seed)
random.shuffle(self.data)
def sample(
self,
num_requests: int,
@@ -284,6 +290,7 @@ class EBChatDataset(BenchmarkDataset):
**kwargs,
) -> list:
samples: list = []
cnt = 1
for entry in self.data:
if len(samples) >= num_requests:
break
@@ -293,17 +300,18 @@ class EBChatDataset(BenchmarkDataset):
new_output_len = int(entry.get("max_tokens", 12288))
if enable_multimodal_chat:
prompt = self.apply_multimodal_chat_transformation(
prompt, None)
prompt = self.apply_multimodal_chat_transformation(prompt, None)
samples.append(
SampleRequest(
no=cnt,
json_data=json_data,
prompt=prompt,
prompt_len=0,
history_QA=history_QA,
expected_output_len=new_output_len,
))
)
)
cnt += 1
self.maybe_oversample_requests(samples, num_requests)
return samples

benchmarks/benchmark_mtp.py (new file)

@@ -0,0 +1,178 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
import argparse
import asyncio
import contextlib
import os
from typing import Union
from benchmark_dataset import EBChatDataset, EBDataset
from benchmark_serving import benchmark
def prepare_input_requests(num_prompts: int, dataset_name: str, dataset_path: str) -> Union[EBDataset, EBChatDataset]:
dataset_mapping = {
"EB": lambda: EBDataset(dataset_path=dataset_path).sample(num_requests=num_prompts),
"EBChat": lambda: EBChatDataset(dataset_path=dataset_path).sample(num_requests=num_prompts),
}
try:
input_requests = dataset_mapping[dataset_name]()
except KeyError as err:
raise ValueError(f"Unknown dataset: {dataset_name}") from err
return input_requests
class FakeTokenizer:
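# Stub tokenizer: encode() returns no tokens, since the MTP benchmark does not depend on real token counts.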
def encode(self, text: str, add_special_tokens: bool = False):
return []
def send_one_batch(base_url, max_concurrency, input_requests, disable_tqdm):
selected_percentile_metrics = ["s_itl"]
selected_percentiles = []
# Run benchmark
results = asyncio.run(
benchmark(
backend="openai-chat",
api_url=f"{base_url}/v1/chat/completions",
base_url=base_url,
model_id="default",
model_name="default",
input_requests=input_requests,
hyper_parameters={},
logprobs=None,
request_rate=float("inf"),
burstiness=1.0,
disable_tqdm=disable_tqdm,
profile=False,
selected_percentile_metrics=selected_percentile_metrics,
selected_percentiles=selected_percentiles,
ignore_eos=False,
goodput_config_dict=None,
max_concurrency=max_concurrency,
lora_modules=None,
extra_body=None,
)
)
record = {
"mean_s_itl_ms": results["mean_s_itl_ms"],
}
return record
def calculate_speedup(acceptance_rate, draft_token_step, t_ori, t_mtp):
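# r_ac = (a + a^2 + ... + a^k) / (1 + a + ... + a^k) is the expected share of
# tokens coming from accepted draft steps (a: acceptance rate, k: draft steps);
# the MTP inter-token latency is scaled by (1 - r_ac) before comparing to the base model.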
tmp = 0.0
for i in range(draft_token_step):
tmp += pow(acceptance_rate, i + 1)
r_ac = tmp / (1 + tmp)
return t_ori / ((1 - r_ac) * t_mtp)
def main(args):
base_url = f"http://{args.host}:{args.port}"
input_requests = prepare_input_requests(args.num_prompts, args.dataset_name, args.dataset_path)
if len(args.max_concurrency) != len(args.s_itl_base_model):
raise ValueError("--max_concurrency should be same length as --s_itl_base_model")
for max_concurrency, s_itl in zip(args.max_concurrency, args.s_itl_base_model):
# Warmup
print("Starting warmup...")
with open(os.devnull, "w") as f:
with contextlib.redirect_stdout(f):
send_one_batch(
base_url,
max_concurrency,
input_requests[0:max_concurrency],
True,
)
# Benchmark
record = send_one_batch(base_url, max_concurrency, input_requests, False)
metric_header = "Speed up"
print("{s:{c}^{n}}".format(s=metric_header, n=50, c="-"))
for draft_token_step in args.draft_token_steps:
speedup = calculate_speedup(
args.acceptance_rate,
draft_token_step,
s_itl,
record["mean_s_itl_ms"],
)
print("{:<40} {:<10.2f}".format(f"Speed up on {draft_token_step} steps draft", speedup))
print("=" * 50)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--host",
type=str,
default="127.0.0.1",
)
parser.add_argument(
"--port",
type=str,
default="8000",
)
parser.add_argument(
"--max-concurrency",
type=int,
nargs="+",
default=(1, 2, 4, 8, 16, 32),
)
parser.add_argument(
"--num-prompts",
type=int,
default=128,
)
parser.add_argument(
"--acceptance-rate",
type=float,
default=0.8,
)
parser.add_argument(
"--draft-token-steps",
type=int,
nargs="+",
default=(1, 2),
)
parser.add_argument(
"--s_itl-base-model",
type=float,
nargs="+",
)
parser.add_argument(
"--dataset-name",
type=str,
default="EBChat",
)
parser.add_argument(
"--dataset-path",
type=str,
)
args = parser.parse_args()
main(args)

File diff suppressed because it is too large.


@@ -24,9 +24,11 @@ import os
from typing import Any
def convert_to_pytorch_benchmark_format(args: argparse.Namespace,
metrics: dict[str, list],
extra_info: dict[str, Any]) -> list:
def convert_to_pytorch_benchmark_format(
args: argparse.Namespace,
metrics: dict[str, list],
extra_info: dict[str, Any],
) -> list:
"""
Save the benchmark results in the format used by PyTorch OSS benchmark with
one metric per record
@@ -54,12 +56,10 @@ def convert_to_pytorch_benchmark_format(args: argparse.Namespace,
},
}
tp = record["benchmark"]["extra_info"]["args"].get(
"tensor_parallel_size")
tp = record["benchmark"]["extra_info"]["args"].get("tensor_parallel_size")
# Save tensor_parallel_size parameter if it's part of the metadata
if not tp and "tensor_parallel_size" in extra_info:
record["benchmark"]["extra_info"]["args"][
"tensor_parallel_size"] = extra_info["tensor_parallel_size"]
record["benchmark"]["extra_info"]["args"]["tensor_parallel_size"] = extra_info["tensor_parallel_size"]
records.append(record)
@@ -68,6 +68,7 @@ def convert_to_pytorch_benchmark_format(args: argparse.Namespace,
class InfEncoder(json.JSONEncoder):
"""InfEncoder"""
def clear_inf(self, o: Any):
"""clear_inf"""
if isinstance(o, dict):
@@ -87,4 +88,3 @@ def write_to_json(filename: str, records: list) -> None:
"""write_to_json"""
with open(filename, "w") as f:
json.dump(records, f, cls=InfEncoder)


@@ -25,32 +25,32 @@ import os
import random
import time
import warnings
import yaml
import requests
import copy
from argparse import ArgumentParser as FlexibleArgumentParser
from collections.abc import AsyncGenerator, Iterable
from dataclasses import dataclass
from datetime import datetime
from typing import Any, Optional
import numpy as np
from backend_request_func import (ASYNC_REQUEST_FUNCS,
OPENAI_COMPATIBLE_BACKENDS, RequestFuncInput,
RequestFuncOutput)
import requests
import yaml
from backend_request_func import (
ASYNC_REQUEST_FUNCS,
OPENAI_COMPATIBLE_BACKENDS,
RequestFuncInput,
RequestFuncOutput,
)
from benchmark_dataset import EBChatDataset, EBDataset, SampleRequest
from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json
from tqdm.asyncio import tqdm
from argparse import ArgumentParser as FlexibleArgumentParser
from benchmark_dataset import (SampleRequest, EBDataset, EBChatDataset)
from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json
MILLISECONDS_TO_SECONDS_CONVERSION = 1000
@dataclass
class BenchmarkMetrics:
"""Class containing all metrics that are used in this script"""
completed: int
total_input: int
total_output: int
@@ -133,8 +133,7 @@ async def get_request(
input_requests: Iterable[SampleRequest] = iter(input_requests)
# Calculate scale parameter theta to maintain the desired request_rate.
assert burstiness > 0, (
f"A positive burstiness factor is expected, but given {burstiness}.")
assert burstiness > 0, f"A positive burstiness factor is expected, but given {burstiness}."
theta = 1.0 / (request_rate * burstiness)
for request in input_requests:
@@ -160,7 +159,7 @@ def calculate_metrics(
) -> tuple[BenchmarkMetrics, list[int]]:
"""Calculates various performance metrics based on the inputs and outputs."""
input_lens: list[int] = []
infer_input_lens: list[int] = []  # number of input tokens on the inference side
actual_output_lens: list[int] = []
total_input = 0
completed = 0
@@ -210,8 +209,9 @@ def calculate_metrics(
s_e2els.append(outputs[i].arrival_time[-1])
# decode speed, excluding the first token
if len(outputs[i].arrival_time) > 2:
s_decodes.append((outputs[i].output_tokens - 1) /
(outputs[i].arrival_time[-1] - outputs[i].arrival_time[1]))
s_decodes.append(
(outputs[i].output_tokens - 1) / (outputs[i].arrival_time[-1] - outputs[i].arrival_time[1])
)
completed += 1
else:
actual_output_lens.append(0)
@@ -224,16 +224,13 @@ def calculate_metrics(
if "ttft" in goodput_config_dict:
valid_metrics.append(ttfts)
slo_values.append(goodput_config_dict["ttft"] /
MILLISECONDS_TO_SECONDS_CONVERSION)
slo_values.append(goodput_config_dict["ttft"] / MILLISECONDS_TO_SECONDS_CONVERSION)
if "tpot" in goodput_config_dict:
valid_metrics.append(all_tpots)
slo_values.append(goodput_config_dict["tpot"] /
MILLISECONDS_TO_SECONDS_CONVERSION)
slo_values.append(goodput_config_dict["tpot"] / MILLISECONDS_TO_SECONDS_CONVERSION)
if "e2el" in goodput_config_dict:
valid_metrics.append(e2els)
slo_values.append(goodput_config_dict["e2el"] /
MILLISECONDS_TO_SECONDS_CONVERSION)
slo_values.append(goodput_config_dict["e2el"] / MILLISECONDS_TO_SECONDS_CONVERSION)
for req_metric in zip(*valid_metrics):
is_good_req = all([s >= r for s, r in zip(slo_values, req_metric)])
@@ -242,9 +239,9 @@ def calculate_metrics(
if completed == 0:
warnings.warn(
"All requests failed. This is likely due to a misconfiguration "
"on the benchmark arguments.",
stacklevel=2)
"All requests failed. This is likely due to a misconfiguration " "on the benchmark arguments.",
stacklevel=2,
)
metrics = BenchmarkMetrics(
completed=completed,
total_input=total_input,
@@ -253,64 +250,50 @@ def calculate_metrics(
request_goodput=good_completed / dur_s,
output_throughput=sum(actual_output_lens) / dur_s,
total_token_throughput=(total_input + sum(actual_output_lens)) / dur_s,
mean_s_decode=np.mean(s_decodes or 0) *
1, # ttfts is empty if streaming is not supported by backend
mean_s_decode=np.mean(s_decodes or 0) * 1, # ttfts is empty if streaming is not supported by backend
std_s_decode=np.std(s_decodes or 0) * 1,
median_s_decode=np.median(s_decodes or 0) * 1,
percentiles_s_decode=[(p, np.percentile(s_decodes or 0, p) * 1)
for p in selected_percentiles],
mean_ttft_ms=np.mean(ttfts or 0) *
1000, # ttfts is empty if streaming is not supported by backend
percentiles_s_decode=[(p, np.percentile(s_decodes or 0, p) * 1) for p in selected_percentiles],
mean_ttft_ms=np.mean(ttfts or 0) * 1000, # ttfts is empty if streaming is not supported by backend
std_ttft_ms=np.std(ttfts or 0) * 1000,
median_ttft_ms=np.median(ttfts or 0) * 1000,
percentiles_ttft_ms=[(p, np.percentile(ttfts or 0, p) * 1000)
for p in selected_percentiles],
mean_s_ttft_ms=np.mean(s_ttfts or 0) *
1000, # ttfts is empty if streaming is not supported by backend
percentiles_ttft_ms=[(p, np.percentile(ttfts or 0, p) * 1000) for p in selected_percentiles],
mean_s_ttft_ms=np.mean(s_ttfts or 0) * 1000, # ttfts is empty if streaming is not supported by backend
std_s_ttft_ms=np.std(s_ttfts or 0) * 1000,
median_s_ttft_ms=np.median(s_ttfts or 0) * 1000,
percentiles_s_ttft_ms=[(p, np.percentile(s_ttfts or 0, p) * 1000)
for p in selected_percentiles],
percentiles_s_ttft_ms=[(p, np.percentile(s_ttfts or 0, p) * 1000) for p in selected_percentiles],
mean_tpot_ms=np.mean(tpots or 0) * 1000,
std_tpot_ms=np.std(tpots or 0) * 1000,
median_tpot_ms=np.median(tpots or 0) * 1000,
percentiles_tpot_ms=[(p, np.percentile(tpots or 0, p) * 1000)
for p in selected_percentiles],
percentiles_tpot_ms=[(p, np.percentile(tpots or 0, p) * 1000) for p in selected_percentiles],
mean_itl_ms=np.mean(itls or 0) * 1000,
std_itl_ms=np.std(itls or 0) * 1000,
median_itl_ms=np.median(itls or 0) * 1000,
percentiles_itl_ms=[(p, np.percentile(itls or 0, p) * 1000)
for p in selected_percentiles],
percentiles_itl_ms=[(p, np.percentile(itls or 0, p) * 1000) for p in selected_percentiles],
mean_s_itl_ms=np.mean(s_itls or 0) * 1000,
std_s_itl_ms=np.std(s_itls or 0) * 1000,
median_s_itl_ms=np.median(s_itls or 0) * 1000,
percentiles_s_itl_ms=[(p, np.percentile(s_itls or 0, p) * 1000)
for p in selected_percentiles],
percentiles_s_itl_ms=[(p, np.percentile(s_itls or 0, p) * 1000) for p in selected_percentiles],
mean_e2el_ms=np.mean(e2els or 0) * 1000,
std_e2el_ms=np.std(e2els or 0) * 1000,
median_e2el_ms=np.median(e2els or 0) * 1000,
percentiles_e2el_ms=[(p, np.percentile(e2els or 0, p) * 1000)
for p in selected_percentiles],
percentiles_e2el_ms=[(p, np.percentile(e2els or 0, p) * 1000) for p in selected_percentiles],
mean_s_e2el_ms=np.mean(s_e2els or 0) * 1000,
std_s_e2el_ms=np.std(s_e2els or 0) * 1000,
median_s_e2el_ms=np.median(s_e2els or 0) * 1000,
percentiles_s_e2el_ms=[(p, np.percentile(s_e2els or 0, p) * 1000)
for p in selected_percentiles],
percentiles_s_e2el_ms=[(p, np.percentile(s_e2els or 0, p) * 1000) for p in selected_percentiles],
mean_input_len=np.mean(input_lens or 0) * 1,
std_input_len=np.std(input_lens or 0) * 1,
median_input_len=np.median(input_lens or 0) * 1,
percentiles_input_len=[(p, np.percentile(input_lens or 0, p))
for p in selected_percentiles],
percentiles_input_len=[(p, np.percentile(input_lens or 0, p)) for p in selected_percentiles],
mean_s_input_len=np.mean(infer_input_lens or 0) * 1,
std_s_input_len=np.std(infer_input_lens or 0) * 1,
median_s_input_len=np.median(infer_input_lens or 0) * 1,
percentiles_s_input_len=[(p, np.percentile(infer_input_lens or 0, p))
for p in selected_percentiles],
percentiles_s_input_len=[(p, np.percentile(infer_input_lens or 0, p)) for p in selected_percentiles],
mean_output_len=np.mean(actual_output_lens or 0) * 1,
std_output_len=np.std(actual_output_lens or 0) * 1,
median_output_len=np.median(actual_output_lens or 0) * 1,
percentiles_output_len=[(p, np.percentile(actual_output_lens or 0, p))
for p in selected_percentiles],
percentiles_output_len=[(p, np.percentile(actual_output_lens or 0, p)) for p in selected_percentiles],
)
return metrics, actual_output_lens
@@ -351,20 +334,22 @@ async def benchmark(
if lora_modules:
# For each input request, choose a LoRA module at random.
lora_modules = iter(
[random.choice(lora_modules) \
for _ in range(len(input_requests))])
lora_modules = iter([random.choice(lora_modules) for _ in range(len(input_requests))])
if profile:
print("Starting profiler...")
profile_input = RequestFuncInput(model=model_id,
model_name=model_name,
prompt=test_prompt,
api_url=base_url + "/start_profile",
output_len=test_output_len,
logprobs=logprobs,
ignore_eos=ignore_eos,
extra_body=extra_body)
test_prompt = None
test_output_len = None
profile_input = RequestFuncInput(
model=model_id,
model_name=model_name,
prompt=test_prompt,
api_url=base_url + "/start_profile",
output_len=test_output_len,
logprobs=logprobs,
ignore_eos=ignore_eos,
extra_body=extra_body,
)
profile_output = await request_func(request_func_input=profile_input)
if profile_output.success:
print("Profiler started")
@@ -384,19 +369,16 @@ async def benchmark(
# and it will simplify the code in limited_request_func.
# semaphore = (asyncio.Semaphore(max_concurrency)
# if max_concurrency else contextlib.nullcontext())
semaphore = (asyncio.Semaphore(max_concurrency)
if max_concurrency else None)
semaphore = asyncio.Semaphore(max_concurrency) if max_concurrency else None
async def limited_request_func(request_func_input, pbar):
if semaphore is None:
return await request_func(request_func_input=request_func_input,
pbar=pbar)
return await request_func(request_func_input=request_func_input, pbar=pbar)
async with semaphore:
return await request_func(request_func_input=request_func_input,
pbar=pbar)
return await request_func(request_func_input=request_func_input, pbar=pbar)
benchmark_start_time = time.perf_counter()
print(f"开始时间:{datetime.now()}")
tasks: list[asyncio.Task] = []
async for request in get_request(input_requests, request_rate, burstiness):
@@ -409,25 +391,26 @@ async def benchmark(
req_lora_module = next(lora_modules)
req_model_id, req_model_name = req_lora_module, req_lora_module
request_func_input = RequestFuncInput(model=req_model_id,
model_name=req_model_name,
prompt=prompt,
prompt_len=0,
history_QA=history_QA,
hyper_parameters=hyper_parameters,
api_url=api_url,
output_len=output_len,
logprobs=logprobs,
ignore_eos=ignore_eos,
extra_body=extra_body)
tasks.append(
asyncio.create_task(
limited_request_func(request_func_input=request_func_input,
pbar=pbar)))
request_func_input = RequestFuncInput(
model=req_model_id,
model_name=req_model_name,
prompt=prompt,
prompt_len=0,
history_QA=history_QA,
hyper_parameters=hyper_parameters,
api_url=api_url,
output_len=output_len,
logprobs=logprobs,
ignore_eos=ignore_eos,
extra_body=extra_body,
)
tasks.append(asyncio.create_task(limited_request_func(request_func_input=request_func_input, pbar=pbar)))
outputs: list[RequestFuncOutput] = await asyncio.gather(*tasks)
print(f"完成时间:{datetime.now()}")
if profile:
print("Stopping profiler...")
test_output_len = None
profile_input = RequestFuncInput(
model=model_id,
prompt=test_prompt,
@@ -454,22 +437,16 @@ async def benchmark(
)
print("Benchmark complete!!!")
print("{s:{c}^{n}}".format(s=' Serving Benchmark Result ', n=50, c='='))
print("{s:{c}^{n}}".format(s=" Serving Benchmark Result ", n=50, c="="))
print("{:<40} {:<10}".format("Successful requests:", metrics.completed))
print("{:<40} {:<10.2f}".format("Benchmark duration (s):",
benchmark_duration))
print("{:<40} {:<10.2f}".format("Benchmark duration (s):", benchmark_duration))
print("{:<40} {:<10}".format("Total input tokens:", metrics.total_input))
print("{:<40} {:<10}".format("Total generated tokens:",
metrics.total_output))
print("{:<40} {:<10.3f}".format("Request throughput (req/s):",
metrics.request_throughput))
print("{:<40} {:<10}".format("Total generated tokens:", metrics.total_output))
print("{:<40} {:<10.3f}".format("Request throughput (req/s):", metrics.request_throughput))
if goodput_config_dict:
print("{:<40} {:<10.2f}".format("Request goodput (req/s):",
metrics.request_goodput))
print("{:<40} {:<10.2f}".format("Output token throughput (tok/s):",
metrics.output_throughput))
print("{:<40} {:<10.2f}".format("Total Token throughput (tok/s):",
metrics.total_token_throughput))
print("{:<40} {:<10.2f}".format("Request goodput (req/s):", metrics.request_goodput))
print("{:<40} {:<10.2f}".format("Output token throughput (tok/s):", metrics.output_throughput))
print("{:<40} {:<10.2f}".format("Total Token throughput (tok/s):", metrics.total_token_throughput))
result = {
"duration": benchmark_duration,
@@ -477,8 +454,7 @@ async def benchmark(
"total_input_tokens": metrics.total_input,
"total_output_tokens": metrics.total_output,
"request_throughput": metrics.request_throughput,
"request_goodput:":
metrics.request_goodput if goodput_config_dict else None,
"request_goodput:": (metrics.request_goodput if goodput_config_dict else None),
"output_throughput": metrics.output_throughput,
"total_token_throughput": metrics.total_token_throughput,
"input_lens": [output.prompt_len for output in outputs],
@@ -491,7 +467,6 @@ async def benchmark(
"reasoning_contents": [output.reasoning_content for output in outputs],
"errors": [output.error for output in outputs],
}
quick_result = copy.deepcopy(result)
def process_one_metric(
# E.g., "ttft"
@@ -505,24 +480,25 @@ async def benchmark(
# metric.
if metric_attribute_name not in selected_percentile_metrics:
return
print("{s:{c}^{n}}".format(s=metric_header, n=50, c='-'))
print("{:<40} {:<10.2f}".format(
f"Mean {metric_name} (ms):",
getattr(metrics, f"mean_{metric_attribute_name}_ms")))
print("{:<40} {:<10.2f}".format(
f"Median {metric_name} (ms):",
getattr(metrics, f"median_{metric_attribute_name}_ms")))
result[f"mean_{metric_attribute_name}_ms"] = getattr(
metrics, f"mean_{metric_attribute_name}_ms")
result[f"median_{metric_attribute_name}_ms"] = getattr(
metrics, f"median_{metric_attribute_name}_ms")
result[f"std_{metric_attribute_name}_ms"] = getattr(
metrics, f"std_{metric_attribute_name}_ms")
for p, value in getattr(metrics,
f"percentiles_{metric_attribute_name}_ms"):
print("{s:{c}^{n}}".format(s=metric_header, n=50, c="-"))
print(
"{:<40} {:<10.2f}".format(
f"Mean {metric_name} (ms):",
getattr(metrics, f"mean_{metric_attribute_name}_ms"),
)
)
print(
"{:<40} {:<10.2f}".format(
f"Median {metric_name} (ms):",
getattr(metrics, f"median_{metric_attribute_name}_ms"),
)
)
result[f"mean_{metric_attribute_name}_ms"] = getattr(metrics, f"mean_{metric_attribute_name}_ms")
result[f"median_{metric_attribute_name}_ms"] = getattr(metrics, f"median_{metric_attribute_name}_ms")
result[f"std_{metric_attribute_name}_ms"] = getattr(metrics, f"std_{metric_attribute_name}_ms")
for p, value in getattr(metrics, f"percentiles_{metric_attribute_name}_ms"):
p_word = str(int(p)) if int(p) == p else str(p)
print("{:<40} {:<10.2f}".format(f"P{p_word} {metric_name} (ms):",
value))
print("{:<40} {:<10.2f}".format(f"P{p_word} {metric_name} (ms):", value))
result[f"p{p_word}_{metric_attribute_name}_ms"] = value
def process_one_length(
@@ -537,31 +513,31 @@ async def benchmark(
# metric.
if metric_attribute_name not in selected_percentile_metrics:
return
print("{s:{c}^{n}}".format(s=metric_header, n=50, c='-'))
print("{:<40} {:<10.2f}".format(
f"Mean {metric_name}:",
getattr(metrics, f"mean_{metric_attribute_name}")))
print("{:<40} {:<10.2f}".format(
f"Median {metric_name}:",
getattr(metrics, f"median_{metric_attribute_name}")))
result[f"mean_{metric_attribute_name}"] = getattr(
metrics, f"mean_{metric_attribute_name}")
result[f"median_{metric_attribute_name}"] = getattr(
metrics, f"median_{metric_attribute_name}")
result[f"std_{metric_attribute_name}"] = getattr(
metrics, f"std_{metric_attribute_name}")
for p, value in getattr(metrics,
f"percentiles_{metric_attribute_name}"):
print("{s:{c}^{n}}".format(s=metric_header, n=50, c="-"))
print(
"{:<40} {:<10.2f}".format(
f"Mean {metric_name}:",
getattr(metrics, f"mean_{metric_attribute_name}"),
)
)
print(
"{:<40} {:<10.2f}".format(
f"Median {metric_name}:",
getattr(metrics, f"median_{metric_attribute_name}"),
)
)
result[f"mean_{metric_attribute_name}"] = getattr(metrics, f"mean_{metric_attribute_name}")
result[f"median_{metric_attribute_name}"] = getattr(metrics, f"median_{metric_attribute_name}")
result[f"std_{metric_attribute_name}"] = getattr(metrics, f"std_{metric_attribute_name}")
for p, value in getattr(metrics, f"percentiles_{metric_attribute_name}"):
p_word = str(int(p)) if int(p) == p else str(p)
print("{:<40} {:<10.2f}".format(f"P{p_word} {metric_name}:",
value))
print("{:<40} {:<10.2f}".format(f"P{p_word} {metric_name}:", value))
result[f"p{p_word}_{metric_attribute_name}"] = value
process_one_length("s_decode", "Decode", "Decode speed (tok/s)")
process_one_metric("ttft", "TTFT", "Time to First Token")
process_one_metric("s_ttft", "S_TTFT", "Infer Time to First Token")
process_one_metric("tpot", "TPOT",
"Time per Output Token (excl. 1st token)")
process_one_metric("tpot", "TPOT", "Time per Output Token (excl. 1st token)")
process_one_metric("itl", "ITL", "Inter-token Latency")
process_one_metric("s_itl", "S_ITL", "Infer Inter-token Latency")
process_one_metric("e2el", "E2EL", "End-to-end Latency")
@@ -581,6 +557,7 @@ def quick_summary(quick_result, selected_percentile_metrics, metrics):
"""
Quick evaluation
"""
def process_quick_metric(
metric_attribute_name: str,
metric_name: str,
@@ -588,7 +565,7 @@ def quick_summary(quick_result, selected_percentile_metrics, metrics):
):
if metric_attribute_name not in selected_percentile_metrics:
return
print("{s:{c}^{n}}".format(s=metric_header, n=50, c='-'))
print("{s:{c}^{n}}".format(s=metric_header, n=50, c="-"))
mean_value = getattr(metrics, f"mean_{metric_attribute_name}_ms")
print("{:<40} {:<10.2f}".format(f"Mean {metric_name} (ms):", mean_value))
quick_result[f"mean_{metric_attribute_name}_ms"] = mean_value
@@ -600,17 +577,17 @@ def quick_summary(quick_result, selected_percentile_metrics, metrics):
):
if metric_attribute_name not in selected_percentile_metrics:
return
print("{s:{c}^{n}}".format(s=metric_header, n=50, c='-'))
print("{s:{c}^{n}}".format(s=metric_header, n=50, c="-"))
mean_value = getattr(metrics, f"mean_{metric_attribute_name}")
print("{:<40} {:<10.2f}".format(f"Mean {metric_name}:", mean_value))
quick_result[f"mean_{metric_attribute_name}"] = mean_value
print("\n\n\n")
print("{s:{c}^{n}}".format(s=' Benchmark Quick Summary ', n=50, c='='))
print("{s:{c}^{n}}".format(s=" Benchmark Quick Summary ", n=50, c="="))
process_quick_length("s_decode", "Decode", "Decode speed (tok/s)")
process_quick_metric("ttft", "TTFT", "Time to First Token")
process_quick_metric("s_ttft", "S_TTFT", "Infer Time to First Token")
process_quick_metric("tpot", "TPOT",
"Time per Output Token (excl. 1st token)")
process_quick_metric("tpot", "TPOT", "Time per Output Token (excl. 1st token)")
process_quick_metric("itl", "ITL", "Inter-token Latency")
process_quick_metric("s_itl", "S_ITL", "Infer Inter-token Latency")
process_quick_metric("e2el", "E2EL", "End-to-end Latency")
@@ -633,12 +610,14 @@ def check_goodput_args(args):
raise ValueError(
f"Invalid metric name found, {slo_name}: {slo_val}. "
"The service level objective name should be one of "
f"{str(VALID_NAMES)}. ")
f"{VALID_NAMES!s}. "
)
if slo_val < 0:
raise ValueError(
f"Invalid value found, {slo_name}: {slo_val}. "
"The service level objective value should be "
"non-negative.")
"non-negative."
)
return goodput_config_dict
@@ -652,37 +631,43 @@ def parse_goodput(slo_pairs):
except ValueError as err:
raise argparse.ArgumentTypeError(
"Invalid format found for service level objectives. "
"Specify service level objectives for goodput as \"KEY:VALUE\" "
'Specify service level objectives for goodput as "KEY:VALUE" '
"pairs, where the key is a metric name, and the value is a "
"number in milliseconds.") from err
"number in milliseconds."
) from err
return goodput_config_dict
def save_to_pytorch_benchmark_format(args: argparse.Namespace,
results: dict[str, Any],
file_name: str) -> None:
def save_to_pytorch_benchmark_format(args: argparse.Namespace, results: dict[str, Any], file_name: str) -> None:
"""Save the benchmarking results to PyTorch Benchmark Format JSON file"""
metrics = [
"median_ttft_ms", "mean_ttft_ms", "std_ttft_ms", "p99_ttft_ms",
"mean_tpot_ms", "median_tpot_ms", "std_tpot_ms", "p99_tpot_ms",
"median_itl_ms", "mean_itl_ms", "std_itl_ms", "p99_itl_ms"
"median_ttft_ms",
"mean_ttft_ms",
"std_ttft_ms",
"p99_ttft_ms",
"mean_tpot_ms",
"median_tpot_ms",
"std_tpot_ms",
"p99_tpot_ms",
"median_itl_ms",
"mean_itl_ms",
"std_itl_ms",
"p99_itl_ms",
]
# These raw data might be useful, but they are rather big. They can be added
# later if needed
ignored_metrics = ["ttfts", "itls", "generated_texts", "errors"]
pt_records = convert_to_pytorch_benchmark_format(
args=args,
metrics={k: [results[k]]
for k in metrics},
extra_info={
k: results[k]
for k in results if k not in metrics and k not in ignored_metrics
})
metrics={k: [results[k]] for k in metrics},
extra_info={k: results[k] for k in results if k not in metrics and k not in ignored_metrics},
)
if pt_records:
# Don't use json suffix here as we don't want CI to pick it up
pt_file = f"{os.path.splitext(file_name)[0]}.pytorch.json"
write_to_json(pt_file, pt_records)
def check_health(api_base_url: str) -> bool:
health_url = api_base_url.rstrip("/") + "/health"
try:
@@ -697,6 +682,7 @@ def check_health(api_base_url: str) -> bool:
print(f"[HEALTH] Failed to connect to {health_url}: {e}")
return False
def main(args: argparse.Namespace):
"""Main entry point"""
print(args)
@@ -707,7 +693,6 @@ def main(args: argparse.Namespace):
model_id = args.model
model_name = args.served_model_name
tokenizer_id = args.tokenizer if args.tokenizer is not None else args.model
tokenizer_mode = args.tokenizer_mode
if args.base_url is not None:
api_url = f"{args.base_url}{args.endpoint}"
@@ -717,23 +702,17 @@ def main(args: argparse.Namespace):
base_url = f"http://{args.host}:{args.port}"
if args.dataset_name is None:
raise ValueError(
"Please specify '--dataset-name' and the corresponding "
"'--dataset-path' if required.")
raise ValueError("Please specify '--dataset-name' and the corresponding " "'--dataset-path' if required.")
# For datasets that follow a similar structure, use a mapping.
dataset_mapping = {
"EB":
lambda: EBDataset(random_seed=args.seed,
dataset_path=args.dataset_path).sample(
num_requests=args.num_prompts,
output_len=args.sharegpt_output_len,
"EB": lambda: EBDataset(random_seed=args.seed, dataset_path=args.dataset_path).sample(
num_requests=args.num_prompts,
output_len=args.sharegpt_output_len,
),
"EBChat":
lambda: EBChatDataset(random_seed=args.seed,
dataset_path=args.dataset_path).sample(
num_requests=args.num_prompts,
output_len=args.sharegpt_output_len,
"EBChat": lambda: EBChatDataset(random_seed=args.seed, dataset_path=args.dataset_path).sample(
num_requests=args.num_prompts,
output_len=args.sharegpt_output_len,
),
}
@@ -751,15 +730,14 @@ def main(args: argparse.Namespace):
"top_p": args.top_p,
"top_k": args.top_k,
"min_p": args.min_p,
"temperature": args.temperature
}.items() if v is not None
"temperature": args.temperature,
}.items()
if v is not None
}
# Sampling parameters are only supported by openai-compatible backend.
if sampling_params and args.backend not in OPENAI_COMPATIBLE_BACKENDS:
raise ValueError(
"Sampling parameters are only supported by openai-compatible "
"backends.")
raise ValueError("Sampling parameters are only supported by openai-compatible " "backends.")
if "temperature" not in sampling_params:
sampling_params["temperature"] = 0.0 # Default to greedy decoding.
@@ -790,15 +768,14 @@ def main(args: argparse.Namespace):
disable_tqdm=args.disable_tqdm,
profile=args.profile,
selected_percentile_metrics=args.percentile_metrics.split(","),
selected_percentiles=[
float(p) for p in args.metric_percentiles.split(",")
],
selected_percentiles=[float(p) for p in args.metric_percentiles.split(",")],
ignore_eos=args.ignore_eos,
goodput_config_dict=goodput_config_dict,
max_concurrency=args.max_concurrency,
lora_modules=args.lora_modules,
extra_body=sampling_params,
))
)
)
# Save config and results to json
if args.save_result:
@@ -819,22 +796,23 @@ def main(args: argparse.Namespace):
kvstring = item.split("=")
result_json[kvstring[0].strip()] = kvstring[1].strip()
else:
raise ValueError(
"Invalid metadata format. Please use KEY=VALUE format."
)
raise ValueError("Invalid metadata format. Please use KEY=VALUE format.")
if not args.save_detailed:
# Remove fields with too many data points
for field in [
"input_lens", "output_lens", "ttfts", "itls",
"generated_texts", "errors"
"input_lens",
"output_lens",
"ttfts",
"itls",
"generated_texts",
"errors",
]:
if field in result_json:
del result_json[field]
# Traffic
result_json["request_rate"] = (args.request_rate if args.request_rate
< float("inf") else "inf")
result_json["request_rate"] = args.request_rate if args.request_rate < float("inf") else "inf"
result_json["burstiness"] = args.burstiness
result_json["max_concurrency"] = args.max_concurrency
@@ -843,21 +821,19 @@ def main(args: argparse.Namespace):
# Save to file
base_model_id = model_id.split("/")[-1]
max_concurrency_str = (f"-concurrency{args.max_concurrency}"
if args.max_concurrency is not None else "")
file_name = f"{backend}-{args.request_rate}qps{max_concurrency_str}-{base_model_id}-{current_dt}.json" #noqa
max_concurrency_str = f"-concurrency{args.max_concurrency}" if args.max_concurrency is not None else ""
file_name = f"{backend}-{args.request_rate}qps{max_concurrency_str}-{base_model_id}-{current_dt}.json"
if args.result_filename:
file_name = args.result_filename
if args.result_dir:
file_name = os.path.join(args.result_dir, file_name)
with open(file_name, "w", encoding='utf-8') as outfile:
with open(file_name, "w", encoding="utf-8") as outfile:
json.dump(result_json, outfile)
save_to_pytorch_benchmark_format(args, result_json, file_name)
if __name__ == "__main__":
-parser = FlexibleArgumentParser(
-description="Benchmark the online serving throughput.")
+parser = FlexibleArgumentParser(description="Benchmark the online serving throughput.")
parser.add_argument(
"--backend",
type=str,
@@ -883,18 +859,29 @@ if __name__ == "__main__":
"--dataset-name",
type=str,
default="sharegpt",
choices=["sharegpt", "burstgpt", "sonnet", "random", "hf", "EB", "EBChat"],
choices=[
"sharegpt",
"burstgpt",
"sonnet",
"random",
"hf",
"EB",
"EBChat",
],
help="Name of the dataset to benchmark on.",
)
parser.add_argument("--dataset-path",
type=str,
default=None,
help="Path to the sharegpt/sonnet dataset. "
"Or the huggingface dataset ID if using HF dataset.")
parser.add_argument("--hyperparameter-path",
type=str,
default=None,
help="Path to the hyperparameter. ")
parser.add_argument(
"--dataset-path",
type=str,
default=None,
help="Path to the sharegpt/sonnet dataset. " "Or the huggingface dataset ID if using HF dataset.",
)
parser.add_argument(
"--hyperparameter-path",
type=str,
default=None,
help="Path to the hyperparameter. ",
)
parser.add_argument(
"--max-concurrency",
type=int,
@@ -906,7 +893,8 @@ if __name__ == "__main__":
"initiated, this argument will control how many are actually allowed "
"to execute at a time. This means that when used in combination, the "
"actual request rate may be lower than specified with --request-rate, "
"if the server is not processing requests fast enough to keep up.")
"if the server is not processing requests fast enough to keep up.",
)
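The --max-concurrency semantics described in that help text are easy to model: requests are still generated at the target rate, but a cap on in-flight requests can pull the realized rate below --request-rate. A hedged illustration (not the script's actual request loop):

import asyncio

async def send_request(i: int, sem: asyncio.Semaphore) -> None:
    async with sem:  # at most N requests execute concurrently
        await asyncio.sleep(0.1)  # stand-in for the real HTTP call

async def main() -> None:
    sem = asyncio.Semaphore(4)  # e.g. --max-concurrency 4
    await asyncio.gather(*(send_request(i, sem) for i in range(16)))

asyncio.run(main())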
parser.add_argument(
"--model",
@@ -917,7 +905,7 @@ if __name__ == "__main__":
parser.add_argument(
"--tokenizer",
type=str,
help="Name or path of the tokenizer, if not using the default tokenizer.", # noqa: E501
help="Name or path of the tokenizer, if not using the default tokenizer.",
)
parser.add_argument("--use-beam-search", action="store_true")
parser.add_argument(
@@ -930,11 +918,13 @@ if __name__ == "__main__":
"--logprobs",
type=int,
default=None,
help=("Number of logprobs-per-token to compute & return as part of "
"the request. If unspecified, then either (1) if beam search "
"is disabled, no logprobs are computed & a single dummy "
"logprob is returned for each token; or (2) if beam search "
"is enabled 1 logprob per token is computed"),
help=(
"Number of logprobs-per-token to compute & return as part of "
"the request. If unspecified, then either (1) if beam search "
"is disabled, no logprobs are computed & a single dummy "
"logprob is returned for each token; or (2) if beam search "
"is enabled 1 logprob per token is computed"
),
)
parser.add_argument(
"--request-rate",
@@ -971,8 +961,7 @@ if __name__ == "__main__":
parser.add_argument(
"--profile",
action="store_true",
help="Use Torch Profiler. The endpoint must be launched with "
"VLLM_TORCH_PROFILER_DIR to enable profiler.",
help="Use Torch Profiler. The endpoint must be launched with " "VLLM_TORCH_PROFILER_DIR to enable profiler.",
)
parser.add_argument(
"--save-result",
@@ -1013,35 +1002,38 @@ if __name__ == "__main__":
"--ignore-eos",
action="store_true",
help="Set ignore_eos flag when sending the benchmark request."
"Warning: ignore_eos is not supported in deepspeed_mii and tgi.")
"Warning: ignore_eos is not supported in deepspeed_mii and tgi.",
)
parser.add_argument(
"--percentile-metrics",
type=str,
default="ttft,tpot,itl",
help="Comma-separated list of selected metrics to report percentils. "
"This argument specifies the metrics to report percentiles. "
"Allowed metric names are \"ttft\", \"tpot\", \"itl\", \"e2el\". "
"Default value is \"ttft,tpot,itl\".")
'Allowed metric names are "ttft", "tpot", "itl", "e2el". '
'Default value is "ttft,tpot,itl".',
)
parser.add_argument(
"--metric-percentiles",
type=str,
default="99",
help="Comma-separated list of percentiles for selected metrics. "
"To report 25-th, 50-th, and 75-th percentiles, use \"25,50,75\". "
"Default value is \"99\". "
"Use \"--percentile-metrics\" to select metrics.",
'To report 25-th, 50-th, and 75-th percentiles, use "25,50,75". '
'Default value is "99". '
'Use "--percentile-metrics" to select metrics.',
)
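How the selected percentiles are typically applied to a collected latency series, as a sketch (np.percentile is standard NumPy; the TTFT values are made up):

import numpy as np

ttfts_ms = [31.0, 42.5, 38.2, 55.1, 47.9]
selected_percentiles = [float(p) for p in "25,50,75".split(",")]
for p in selected_percentiles:
    print(f"P{p:g} TTFT: {np.percentile(ttfts_ms, p):.1f} ms")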
parser.add_argument(
"--goodput",
nargs="+",
required=False,
help="Specify service level objectives for goodput as \"KEY:VALUE\" "
help='Specify service level objectives for goodput as "KEY:VALUE" '
"pairs, where the key is a metric name, and the value is in "
"milliseconds. Multiple \"KEY:VALUE\" pairs can be provided, "
'milliseconds. Multiple "KEY:VALUE" pairs can be provided, '
"separated by spaces. Allowed request level metric names are "
"\"ttft\", \"tpot\", \"e2el\". For more context on the definition of "
'"ttft", "tpot", "e2el". For more context on the definition of '
"goodput, refer to DistServe paper: https://arxiv.org/pdf/2401.09670 "
"and the blog: https://hao-ai-lab.github.io/blogs/distserve")
"and the blog: https://hao-ai-lab.github.io/blogs/distserve",
)
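A hedged sketch of parsing those "KEY:VALUE" goodput pairs; the accepted keys follow the help text above (ttft, tpot, e2el), with values in milliseconds:

VALID_NAMES = {"ttft", "tpot", "e2el"}

def parse_goodput(pairs: list[str]) -> dict[str, float]:
    config: dict[str, float] = {}
    for item in pairs:
        name, _, value = item.partition(":")
        if name not in VALID_NAMES:
            raise ValueError(f"Invalid metric name: {name}")
        config[name] = float(value)  # SLO threshold in milliseconds
    return config

print(parse_goodput(["ttft:300", "e2el:2000"]))  # {'ttft': 300.0, 'e2el': 2000.0}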
# group for dataset specific arguments
sonnet_group = parser.add_argument_group("sonnet dataset options")
@@ -1069,8 +1061,8 @@ if __name__ == "__main__":
"--sharegpt-output-len",
type=int,
default=None,
help="Output length for each request. Overrides the output length "
"from the ShareGPT dataset.")
help="Output length for each request. Overrides the output length " "from the ShareGPT dataset.",
)
random_group = parser.add_argument_group("random dataset options")
random_group.add_argument(
@@ -1098,29 +1090,24 @@ if __name__ == "__main__":
"--random-prefix-len",
type=int,
default=0,
help=("Number of fixed prefix tokens before the random context "
"in a request. "
"The total input length is the sum of `random-prefix-len` and "
"a random "
"context length sampled from [input_len * (1 - range_ratio), "
"input_len * (1 + range_ratio)]."),
help=(
"Number of fixed prefix tokens before the random context "
"in a request. "
"The total input length is the sum of `random-prefix-len` and "
"a random "
"context length sampled from [input_len * (1 - range_ratio), "
"input_len * (1 + range_ratio)]."
),
)
hf_group = parser.add_argument_group("hf dataset options")
hf_group.add_argument("--hf-subset",
type=str,
default=None,
help="Subset of the HF dataset.")
hf_group.add_argument("--hf-split",
type=str,
default=None,
help="Split of the HF dataset.")
hf_group.add_argument("--hf-subset", type=str, default=None, help="Subset of the HF dataset.")
hf_group.add_argument("--hf-split", type=str, default=None, help="Split of the HF dataset.")
hf_group.add_argument(
"--hf-output-len",
type=int,
default=None,
help="Output length for each request. Overrides the output lengths "
"from the sampled HF dataset.",
help="Output length for each request. Overrides the output lengths " "from the sampled HF dataset.",
)
sampling_group = parser.add_argument_group("sampling parameters")
@@ -1128,52 +1115,58 @@ if __name__ == "__main__":
"--top-p",
type=float,
default=None,
help="Top-p sampling parameter. Only has effect on openai-compatible "
"backends.")
help="Top-p sampling parameter. Only has effect on openai-compatible " "backends.",
)
sampling_group.add_argument(
"--top-k",
type=int,
default=None,
help="Top-k sampling parameter. Only has effect on openai-compatible "
"backends.")
help="Top-k sampling parameter. Only has effect on openai-compatible " "backends.",
)
sampling_group.add_argument(
"--min-p",
type=float,
default=None,
help="Min-p sampling parameter. Only has effect on openai-compatible "
"backends.")
help="Min-p sampling parameter. Only has effect on openai-compatible " "backends.",
)
sampling_group.add_argument(
"--temperature",
type=float,
default=None,
help="Temperature sampling parameter. Only has effect on "
"openai-compatible backends. If not specified, default to greedy "
"decoding (i.e. temperature==0.0).")
"decoding (i.e. temperature==0.0).",
)
parser.add_argument(
-'--tokenizer-mode',
+"--tokenizer-mode",
type=str,
default="auto",
-choices=['auto', 'slow', 'mistral', 'custom'],
+choices=["auto", "slow", "mistral", "custom"],
help='The tokenizer mode.\n\n* "auto" will use the '
'fast tokenizer if available.\n* "slow" will '
-'always use the slow tokenizer. \n* '
+"always use the slow tokenizer. \n* "
'"mistral" will always use the `mistral_common` tokenizer. \n*'
-'"custom" will use --tokenizer to select the preregistered tokenizer.')
+'"custom" will use --tokenizer to select the preregistered tokenizer.',
+)
parser.add_argument("--served-model-name",
type=str,
default=None,
help="The model name used in the API. "
"If not specified, the model name will be the "
"same as the ``--model`` argument. ")
parser.add_argument(
"--served-model-name",
type=str,
default=None,
help="The model name used in the API. "
"If not specified, the model name will be the "
"same as the ``--model`` argument. ",
)
parser.add_argument("--lora-modules",
nargs='+',
default=None,
help="A subset of LoRA module names passed in when "
"launching the server. For each request, the "
"script chooses a LoRA module at random.")
parser.add_argument(
"--lora-modules",
nargs="+",
default=None,
help="A subset of LoRA module names passed in when "
"launching the server. For each request, the "
"script chooses a LoRA module at random.",
)
args = parser.parse_args()


@@ -7,4 +7,4 @@ tensor_parallel_size: 1
enable_chunked_prefill: True
max_num_batched_tokens: 384
quantization: wint4
reasoning_parser: ernie-45-vl


@@ -12,4 +12,4 @@ rdma_comm_ports: "7671,7672,7673,7674"
pd_comm_port: "2334"
max_num_batched_tokens: 384
max_num_partial_prefills: 3
max_long_partial_prefills: 3


@@ -9,4 +9,4 @@ cache_queue_port: 55664
engine_worker_queue_port: 6677
cache_transfer_protocol: "rdma,ipc"
rdma_comm_ports: "7675,7676,7677,7678"
pd_comm_port: "2333"
pd_comm_port: "2333"


@@ -3,3 +3,4 @@ max_num_seqs: 96
gpu_memory_utilization: 0.9
kv_cache_ratio: 0.71
tensor_parallel_size: 4
+quantization: wint4


@@ -10,4 +10,4 @@ engine_worker_queue_port: 6677
num_gpu_blocks_override: 1024
cache_transfer_protocol: "rdma"
rdma_comm_ports: "7671,7672,7673,7674,7675,7676,7677,7678"
pd_comm_port: "2334"
pd_comm_port: "2334"


@@ -10,4 +10,4 @@ splitwise_role: decode
engine_worker_queue_port: 6678
cache_transfer_protocol: "rdma,ipc"
rdma_comm_ports: "7671,7672,7673,7674"
pd_comm_port: "2334"
pd_comm_port: "2334"


@@ -9,4 +9,4 @@ cache_queue_port: 55664
engine_worker_queue_port: 6677
cache_transfer_protocol: "rdma,ipc"
rdma_comm_ports: "7675,7676,7677,7678"
pd_comm_port: "2333"
pd_comm_port: "2333"


@@ -12,4 +12,4 @@ rdma_comm_ports: "7671,7672,7673,7674"
pd_comm_port: "2334"
max_num_batched_tokens: 384
max_num_partial_prefills: 3
max_long_partial_prefills: 3


@@ -9,4 +9,4 @@ cache_queue_port: 55664
engine_worker_queue_port: 6677
cache_transfer_protocol: "rdma,ipc"
rdma_comm_ports: "7675,7676,7677,7678"
pd_comm_port: "2333"
pd_comm_port: "2333"


@@ -3,3 +3,4 @@ max_num_seqs: 96
gpu_memory_utilization: 0.9
kv_cache_ratio: 0.71
tensor_parallel_size: 8
+quantization: wint8


@@ -2,4 +2,5 @@ max_model_len: 32768
max_num_seqs: 128
kv_cache_ratio: 0.75
tensor_parallel_size: 1
-enable_static_graph_inference: True
+graph_optimization_config:
+  graph_opt_level: 1
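This and the following config hunks replace the old enable_static_graph_inference flag with a nested graph_optimization_config block. A quick way to sanity-check the new shape (assuming PyYAML; the keys are taken verbatim from the diff):

import yaml

text = """
max_model_len: 32768
max_num_seqs: 128
graph_optimization_config:
  graph_opt_level: 1
"""
cfg = yaml.safe_load(text)
print(cfg["graph_optimization_config"]["graph_opt_level"])  # 1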


@@ -2,4 +2,5 @@ max_model_len: 32768
max_num_seqs: 128
kv_cache_ratio: 0.75
tensor_parallel_size: 1
-enable_static_graph_inference: True
+graph_optimization_config:
+  graph_opt_level: 1


@@ -3,4 +3,5 @@ max_num_seqs: 128
kv_cache_ratio: 0.75
tensor_parallel_size: 1
quantization: wint8
-enable_static_graph_inference: True
+graph_optimization_config:
+  graph_opt_level: 1


@@ -3,4 +3,5 @@ max_num_seqs: 128
kv_cache_ratio: 0.75
tensor_parallel_size: 1
quantization: wint8
-enable_static_graph_inference: True
+graph_optimization_config:
+  graph_opt_level: 1


@@ -2,4 +2,5 @@ max_model_len: 32768
max_num_seqs: 128
kv_cache_ratio: 0.75
tensor_parallel_size: 1
-enable_static_graph_inference: True
+graph_optimization_config:
+  graph_opt_level: 1


@@ -3,4 +3,5 @@ max_num_seqs: 128
kv_cache_ratio: 0.75
tensor_parallel_size: 1
quantization: wint4
-enable_static_graph_inference: True
+graph_optimization_config:
+  graph_opt_level: 1


@@ -3,4 +3,5 @@ max_num_seqs: 96
gpu_memory_utilization: 0.9
kv_cache_ratio: 0.71
tensor_parallel_size: 4
-enable_static_graph_inference: True
+graph_optimization_config:
+  graph_opt_level: 1


@@ -2,4 +2,5 @@ max_model_len: 32768
max_num_seqs: 128
kv_cache_ratio: 0.75
tensor_parallel_size: 1
-enable_static_graph_inference: True
+graph_optimization_config:
+  graph_opt_level: 1


@@ -2,4 +2,5 @@ max_model_len: 32768
max_num_seqs: 128
kv_cache_ratio: 0.75
tensor_parallel_size: 1
-enable_static_graph_inference: True
+graph_optimization_config:
+  graph_opt_level: 1


@@ -3,4 +3,5 @@ max_num_seqs: 128
kv_cache_ratio: 0.75
tensor_parallel_size: 1
quantization: wfp8afp8
-enable_static_graph_inference: True
+graph_optimization_config:
+  graph_opt_level: 1


@@ -2,4 +2,5 @@ max_model_len: 32768
max_num_seqs: 128
kv_cache_ratio: 0.75
tensor_parallel_size: 1
-enable_static_graph_inference: True
+graph_optimization_config:
+  graph_opt_level: 1


@@ -2,4 +2,5 @@ max_model_len: 32768
max_num_seqs: 128
kv_cache_ratio: 0.75
tensor_parallel_size: 1
-enable_static_graph_inference: True
+graph_optimization_config:
+  graph_opt_level: 1


@@ -3,4 +3,5 @@ max_num_seqs: 128
kv_cache_ratio: 0.75
tensor_parallel_size: 1
quantization: wint8
-enable_static_graph_inference: True
+graph_optimization_config:
+  graph_opt_level: 1


@@ -3,4 +3,5 @@ max_num_seqs: 128
kv_cache_ratio: 0.75
tensor_parallel_size: 1
quantization: wint8
-enable_static_graph_inference: True
+graph_optimization_config:
+  graph_opt_level: 1


@@ -2,4 +2,5 @@ max_model_len: 32768
max_num_seqs: 128
kv_cache_ratio: 0.75
tensor_parallel_size: 1
-enable_static_graph_inference: True
+graph_optimization_config:
+  graph_opt_level: 1


@@ -3,4 +3,5 @@ max_num_seqs: 128
kv_cache_ratio: 0.75
tensor_parallel_size: 1
quantization: wint4
-enable_static_graph_inference: True
+graph_optimization_config:
+  graph_opt_level: 1


@@ -3,4 +3,4 @@ max_num_seqs: 75
gpu_memory_utilization: 0.85
kv_cache_ratio: 0.75
quantization: wint4
tensor_parallel_size: 4


@@ -3,4 +3,4 @@ max_num_seqs: 25
gpu_memory_utilization: 0.9
kv_cache_ratio: 0.75
quantization: wint8
tensor_parallel_size: 4


@@ -1,3 +1,3 @@
metadata:
min_tokens: 32
max_tokens: 33


@@ -5,4 +5,4 @@ metadata:
max_tokens: 12288
repetition_penalty: 1.05
frequency_penalty: 0
presence_penalty: 0


@@ -5,4 +5,4 @@ metadata:
max_tokens: 12288
repetition_penalty: 1.0
frequency_penalty: 0
presence_penalty: 1.5


@@ -0,0 +1,11 @@
+top_p: 1.0
+temperature: 1.0
+metadata:
+min_tokens: 1
+max_tokens: 30721
+repetition_penalty: 1.0
+frequency_penalty: 0
+presence_penalty: 0
+skip_special_tokens: false
+chat_template_kwargs:
+enable_thinking: true


@@ -3,4 +3,4 @@ max_num_seqs: 64
gpu_memory_utilization: 0.9
tensor_parallel_size: 8
quantization: wint8
reasoning_parser: ernie-x1


@@ -18,6 +18,9 @@ BUILD_WHEEL=${1:-1}
PYTHON_VERSION=${2:-"python"}
export python=$PYTHON_VERSION
FD_CPU_USE_BF16=${3:-"false"}
+# FD_BUILDING_ARCS: Specify target CUDA architectures for custom ops, e.g., "[80, 90, 100]".
+# For SM90 (Hopper), use 90. For SM100 (Blackwell), use 100.
+# These will be translated to 90a / 100a in setup_ops.py for specific features.
FD_BUILDING_ARCS=${4:-""}
@@ -74,8 +77,10 @@ function copy_ops(){
is_rocm=`$python -c "import paddle; print(paddle.is_compiled_with_rocm())"`
if [ "$is_rocm" = "True" ]; then
DEVICE_TYPE="rocm"
+mkdir -p ../fastdeploy/model_executor/ops/base
+cp -r ./${OPS_TMP_DIR_BASE}/${WHEEL_BASE_NAME}/* ../fastdeploy/model_executor/ops/base
cp -r ./${OPS_TMP_DIR}/${WHEEL_NAME}/* ../fastdeploy/model_executor/ops/gpu
-echo -e "ROCM ops have been copy to fastdeploy"
+echo -e "BASE and ROCM ops have been copy to fastdeploy"
return
fi
mkdir -p ../fastdeploy/model_executor/ops/base
@@ -104,6 +109,23 @@ function copy_ops(){
return
fi
if_corex=`$python -c "import paddle; print(paddle.is_compiled_with_custom_device(\"iluvatar_gpu\"))"`
if [ "$if_corex" = "True" ]; then
DEVICE_TYPE="iluvatar-gpu"
cp -r ./${OPS_TMP_DIR_BASE}/${WHEEL_BASE_NAME}/* ../fastdeploy/model_executor/ops/base
cp -r ./${OPS_TMP_DIR}/${WHEEL_NAME}/* ../fastdeploy/model_executor/ops/iluvatar
echo -e "BASE and Iluvatar ops have been copy to fastdeploy"
return
fi
is_gcu=`$python -c "import paddle; print(paddle.is_compiled_with_custom_device('gcu'))"`
if [ "$is_gcu" = "True" ]; then
DEVICE_TYPE="gcu"
cp -r ${OPS_TMP_DIR}/${WHEEL_NAME}/* ../fastdeploy/model_executor/ops/gcu
echo -e "gcu ops have been copy to fastdeploy"
return
fi
DEVICE_TYPE="cpu"
cp -r ./${OPS_TMP_DIR_BASE}/${WHEEL_BASE_NAME}/* ../fastdeploy/model_executor/ops/base
cd ../../../../
@@ -163,17 +185,6 @@ function build_and_install() {
exit 1
fi
echo -e "${BLUE}[build]${NONE} ${GREEN}build fastdeploy wheel success${NONE}\n"
echo -e "${BLUE}[install]${NONE} installing fastdeploy..."
cd $DIST_DIR
find . -name "fastdeploy*.whl" | xargs ${python} -m pip install --force-reinstall --no-cache-dir
if [ $? -ne 0 ]; then
cd ..
echo -e "${RED}[FAIL]${NONE} install fastdeploy wheel failed"
exit 1
fi
echo -e "${BLUE}[install]${NONE} ${GREEN}fastdeploy install success${NONE}\n"
cd ..
}
function version_info() {
@@ -181,7 +192,10 @@ function version_info() {
fastdeploy_git_commit_id=$(git rev-parse HEAD)
paddle_version=$(${python} -c "import paddle; print(paddle.__version__)")
paddle_git_commit_id=$(${python} -c "import paddle; print(paddle.__git_commit__)")
-cuda_version=$(nvcc -V | grep -Po "(?<=release )[\d.]+(?=, V)")
+cuda_version="nvcc-not-installed"
+if command -v nvcc &> /dev/null; then
+cuda_version=$(nvcc -V | grep -Po "(?<=release )[\d.]+(?=, V)")
+fi
cxx_version=$(g++ --version | head -n 1 | grep -Po "(?<=\) )[\d.]+")
echo "fastdeploy GIT COMMIT ID: $fastdeploy_git_commit_id" > $output_file


@@ -46,8 +46,8 @@ std::vector<paddle::Tensor> AppendAttentionKernel(
const paddle::Tensor& seq_lens_encoder,
const paddle::Tensor& seq_lens_decoder,
const paddle::Tensor& seq_lens_this_time,
-const paddle::Tensor& padding_offsets,
-const paddle::Tensor& cum_offsets,
+const paddle::Tensor& batch_id_per_token,
+const paddle::Tensor& cu_seqlens_q,
const paddle::Tensor& block_tables,
const paddle::Tensor& encoder_batch_ids,
const paddle::Tensor& encoder_tile_ids_per_batch,
@@ -165,8 +165,8 @@ std::vector<paddle::Tensor> AppendAttentionKernel(
seq_lens_this_time,
seq_lens_decoder,
seq_lens_encoder,
-padding_offsets,
-cum_offsets,
+batch_id_per_token,
+cu_seqlens_q,
block_tables,
lambda_batch_ids,
lambda_tile_ids_per_batch,
@@ -202,8 +202,8 @@ std::vector<paddle::Tensor> AppendAttentionKernel(
seq_lens_this_time,
seq_lens_encoder,
seq_lens_decoder,
-padding_offsets,
-cum_offsets,
+batch_id_per_token,
+cu_seqlens_q,
block_tables,
kv_batch_ids,
kv_tile_ids_per_batch,
@@ -274,8 +274,8 @@ std::vector<paddle::Tensor> AppendAttentionKernel(
qkv, // [token_num, num_heads, head_dim]
seq_lens_decoder,
seq_lens_encoder,
-padding_offsets,
-cum_offsets,
+batch_id_per_token,
+cu_seqlens_q,
block_tables,
rotary_embs,
qkv_out_scales,
@@ -297,8 +297,8 @@ std::vector<paddle::Tensor> AppendAttentionKernel(
qkv_out, // [token_num, num_heads, head_dim]
seq_lens_decoder,
seq_lens_encoder,
-padding_offsets,
-cum_offsets,
+batch_id_per_token,
+cu_seqlens_q,
block_tables,
rotary_embs,
qkv_out_scales,
@@ -322,8 +322,8 @@ std::vector<paddle::Tensor> AppendAttentionKernel(
qkv, // [token_num, num_heads, head_dim]
seq_lens_decoder,
seq_lens_encoder,
-padding_offsets,
-cum_offsets,
+batch_id_per_token,
+cu_seqlens_q,
block_tables,
rotary_embs,
qkv_out_scales,
@@ -346,8 +346,8 @@ std::vector<paddle::Tensor> AppendAttentionKernel(
qkv_out, // [token_num, num_heads, head_dim]
seq_lens_decoder,
seq_lens_encoder,
-padding_offsets,
-cum_offsets,
+batch_id_per_token,
+cu_seqlens_q,
block_tables,
rotary_embs,
qkv_out_scales,
@@ -403,8 +403,8 @@ std::vector<paddle::Tensor> AppendAttention(
const paddle::Tensor& seq_lens_encoder,
const paddle::Tensor& seq_lens_decoder,
const paddle::Tensor& seq_lens_this_time,
-const paddle::Tensor& padding_offsets,
-const paddle::Tensor& cum_offsets,
+const paddle::Tensor& batch_id_per_token,
+const paddle::Tensor& cu_seqlens_q,
const paddle::Tensor& block_tables,
const paddle::Tensor& encoder_batch_ids,
const paddle::Tensor& encoder_tile_ids_per_batch,
@@ -462,7 +462,7 @@ std::vector<paddle::Tensor> AppendAttention(
meta_data.max_blocks_per_seq = block_tables.dims()[1];
meta_data.block_size = key_cache.dims()[2];
-meta_data.batch_size = cum_offsets.dims()[0];
+meta_data.batch_size = seq_lens_this_time.dims()[0];
auto dispatch_by_template = [&](auto temp_args) -> std::vector<paddle::Tensor> {
return AppendAttentionKernel<type2value<decltype(temp_args)>::value>(
@@ -473,8 +473,8 @@ std::vector<paddle::Tensor> AppendAttention(
seq_lens_encoder,
seq_lens_decoder,
seq_lens_this_time,
-padding_offsets,
-cum_offsets,
+batch_id_per_token,
+cu_seqlens_q,
block_tables,
encoder_batch_ids,
encoder_tile_ids_per_batch,
@@ -550,8 +550,8 @@ std::vector<std::vector<int64_t>> AppendAttentionInferShape(
const std::vector<int64_t>& seq_lens_encoder_shape,
const std::vector<int64_t>& seq_lens_decoder_shape,
const std::vector<int64_t>& seq_lens_this_time_shape,
-const std::vector<int64_t>& padding_offsets_shape,
-const std::vector<int64_t>& cum_offsets_shape,
+const std::vector<int64_t>& batch_id_per_token_shape,
+const std::vector<int64_t>& cu_seqlens_q_shape,
const std::vector<int64_t>& block_tables_shape,
const std::vector<int64_t>& encoder_batch_ids_shape,
const std::vector<int64_t>& encoder_tile_ids_per_batch_shape,
@@ -610,8 +610,8 @@ std::vector<paddle::DataType> AppendAttentionInferDtype(
const paddle::DataType& seq_lens_encoder_dtype,
const paddle::DataType& seq_lens_decoder_dtype,
const paddle::DataType& seq_lens_this_time_dtype,
-const paddle::DataType& padding_offsets_dtype,
-const paddle::DataType& cum_offsets_dtype,
+const paddle::DataType& batch_id_per_token_dtype,
+const paddle::DataType& cu_seqlens_q_dtype,
const paddle::DataType& block_tables_dtype,
const paddle::DataType& encoder_batch_ids_dtype,
const paddle::DataType& encoder_tile_ids_per_batch_dtype,
@@ -688,8 +688,8 @@ PD_BUILD_STATIC_OP(append_attention)
"seq_lens_encoder",
"seq_lens_decoder",
"seq_lens_this_time",
"padding_offsets",
"cum_offsets",
"batch_id_per_token",
"cu_seqlens_q",
"block_tables",
"encoder_batch_ids",
"encoder_tile_ids_per_batch",


@@ -41,7 +41,7 @@ __global__ void multi_query_append_attention_kernel(
const int *__restrict__ seq_lens_kv,
const int *__restrict__ batch_ids,
const int *__restrict__ tile_ids_per_batch,
-const int *__restrict__ cum_offsets,
+const int *__restrict__ cu_seqlens_q,
const int *__restrict__ block_table, // [bsz, block_num_per_seq]
const int max_seq_len,
const int max_dec_len,
@@ -114,8 +114,7 @@ __global__ void multi_query_append_attention_kernel(
const uint32_t kv_n_stride = kv_num_heads * BLOCK_SIZE * HEAD_DIM;
const uint32_t kv_h_stride = BLOCK_SIZE * HEAD_DIM;
const uint32_t kv_b_stride = HEAD_DIM;
-const uint32_t q_start_seq_id =
-batch_id * max_seq_len - __ldg(&cum_offsets[batch_id]);
+const uint32_t q_start_seq_id = cu_seqlens_q[batch_id];
const uint32_t q_base_seq_id_this_block =
(tile_id * NUM_WARPS + wid) * num_frags_x * 16;
const uint32_t q_offset = q_start_seq_id * q_ori_n_stride +
@@ -405,7 +404,7 @@ __global__ void multi_query_append_attention_warp1_4_kernel(
const int *__restrict__ seq_lens_kv,
const int *__restrict__ batch_ids,
const int *__restrict__ tile_ids_per_batch,
-const int *__restrict__ cum_offsets,
+const int *__restrict__ cu_seqlens_q,
const int *__restrict__ block_table, // [bsz, block_num_per_seq]
const int max_seq_len,
const int max_dec_len,
@@ -477,8 +476,7 @@ __global__ void multi_query_append_attention_warp1_4_kernel(
const uint32_t kv_n_stride = kv_num_heads * BLOCK_SIZE * HEAD_DIM;
const uint32_t kv_h_stride = BLOCK_SIZE * HEAD_DIM;
const uint32_t kv_b_stride = HEAD_DIM;
-const uint32_t q_start_seq_id =
-batch_id * max_seq_len - __ldg(&cum_offsets[batch_id]);
+const uint32_t q_start_seq_id = cu_seqlens_q[batch_id];
const uint32_t q_base_seq_id_this_block = tile_id * num_frags_x * 16;
const uint32_t q_offset = q_start_seq_id * q_ori_n_stride +
q_head_idx * HEAD_DIM +
@@ -775,8 +773,8 @@ void MultiQueryAppendAttention(
const paddle::Tensor &seq_lens_q,
const paddle::Tensor &seq_lens_kv,
const paddle::Tensor &seq_lens_encoder,
-const paddle::Tensor &padding_offsets,
-const paddle::Tensor &cum_offsets,
+const paddle::Tensor &batch_id_per_token,
+const paddle::Tensor &cu_seqlens_q,
const paddle::Tensor &block_table,
const paddle::Tensor &batch_ids,
const paddle::Tensor &tile_ids_per_batch,
@@ -882,7 +880,7 @@ void MultiQueryAppendAttention(
seq_lens_kv.data<int>(),
batch_ids.data<int>(),
tile_ids_per_batch.data<int>(),
-cum_offsets.data<int>(),
+cu_seqlens_q.data<int>(),
block_table.data<int>(),
max_seq_len,
max_dec_len,
@@ -939,7 +937,7 @@ void MultiQueryAppendAttention(
seq_lens_kv.data<int>(),
batch_ids.data<int>(),
tile_ids_per_batch.data<int>(),
-cum_offsets.data<int>(),
+cu_seqlens_q.data<int>(),
block_table.data<int>(),
max_seq_len,
max_dec_len,
@@ -974,7 +972,7 @@ void MultiQueryAppendAttention(
seq_lens_q.data<int>(),
seq_lens_kv.data<int>(),
seq_lens_encoder.data<int>(),
-cum_offsets.data<int>(),
+cu_seqlens_q.data<int>(),
shift_bias ? reinterpret_cast<NV_TYPE *>(
const_cast<T *>(shift_bias.get().data<T>()))
: nullptr,
@@ -1009,7 +1007,8 @@ void MultiQueryAppendAttention(
seq_lens_q.data<int>(),
seq_lens_kv.data<int>(),
seq_lens_encoder.data<int>(),
-padding_offsets.data<int>(),
+batch_id_per_token.data<int>(),
+cu_seqlens_q.data<int>(),
shift_bias ? reinterpret_cast<NV_TYPE *>(
const_cast<T *>(shift_bias.get().data<T>()))
: nullptr,
@@ -1103,7 +1102,7 @@ void MultiQueryAppendAttention(
seq_lens_kv.data<int>(),
batch_ids.data<int>(),
tile_ids_per_batch.data<int>(),
-cum_offsets.data<int>(),
+cu_seqlens_q.data<int>(),
block_table.data<int>(),
max_seq_len,
max_dec_len,
@@ -1171,7 +1170,7 @@ void MultiQueryAppendAttention(
seq_lens_kv.data<int>(),
batch_ids.data<int>(),
tile_ids_per_batch.data<int>(),
-cum_offsets.data<int>(),
+cu_seqlens_q.data<int>(),
block_table.data<int>(),
max_seq_len,
max_dec_len,
@@ -1207,7 +1206,7 @@ void MultiQueryAppendAttention(
seq_lens_q.data<int>(),
seq_lens_kv.data<int>(),
seq_lens_encoder.data<int>(),
-cum_offsets.data<int>(),
+cu_seqlens_q.data<int>(),
shift_bias ? reinterpret_cast<NV_TYPE *>(
const_cast<T *>(shift_bias.get().data<T>()))
: nullptr,
@@ -1242,7 +1241,8 @@ void MultiQueryAppendAttention(
seq_lens_q.data<int>(),
seq_lens_kv.data<int>(),
seq_lens_encoder.data<int>(),
-padding_offsets.data<int>(),
+batch_id_per_token.data<int>(),
+cu_seqlens_q.data<int>(),
shift_bias ? reinterpret_cast<NV_TYPE *>(
const_cast<T *>(shift_bias.get().data<T>()))
: nullptr,
@@ -1289,8 +1289,8 @@ void CascadeAppendAttentionC16Kernel(
const paddle::Tensor& seq_lens_q,
const paddle::Tensor& seq_lens_kv,
const paddle::Tensor& seq_lens_encoder,
-const paddle::Tensor& padding_offsets,
-const paddle::Tensor& cum_offsets,
+const paddle::Tensor& batch_id_per_token,
+const paddle::Tensor& cu_seqlens_q,
const paddle::Tensor& block_table,
const paddle::Tensor& batch_ids,
const paddle::Tensor& tile_ids_per_batch,
@@ -1352,8 +1352,8 @@ void CascadeAppendAttentionC16Kernel(
seq_lens_q,
seq_lens_kv,
seq_lens_encoder,
-padding_offsets,
-cum_offsets,
+batch_id_per_token,
+cu_seqlens_q,
block_table,
batch_ids,
tile_ids_per_batch,


@@ -46,7 +46,7 @@ __global__ void multi_query_append_attention_c4_kernel(
const int *__restrict__ seq_lens_kv,
const int *__restrict__ batch_ids,
const int *__restrict__ tile_ids_per_batch,
-const int *__restrict__ cum_offsets,
+const int *__restrict__ cu_seqlens_q,
const int *__restrict__ block_table, // [bsz, block_num_per_seq]
const int max_seq_len,
const int max_dec_len,
@@ -144,8 +144,7 @@ __global__ void multi_query_append_attention_c4_kernel(
const uint32_t kv_h_stride = BLOCK_SIZE * HEAD_DIM / 2;
const uint32_t kv_b_stride = HEAD_DIM / 2;
const uint32_t kv_d_stride = BLOCK_SIZE / 2;
-const uint32_t q_start_seq_id =
-batch_id * max_seq_len - __ldg(&cum_offsets[batch_id]);
+const uint32_t q_start_seq_id = cu_seqlens_q[batch_id];
const uint32_t q_base_seq_id_this_block =
(tile_id * NUM_WARPS + wid) * num_frags_x * 16;
const uint32_t q_offset = q_start_seq_id * q_ori_n_stride +
@@ -504,7 +503,7 @@ __global__ void multi_query_append_attention_c4_warp1_4_kernel(
const int *__restrict__ seq_lens_kv,
const int *__restrict__ batch_ids,
const int *__restrict__ tile_ids_per_batch,
-const int *__restrict__ cum_offsets,
+const int *__restrict__ cu_seqlens_q,
const int *__restrict__ block_table, // [bsz, block_num_per_seq]
const int max_seq_len,
const int max_dec_len,
@@ -601,8 +600,7 @@ __global__ void multi_query_append_attention_c4_warp1_4_kernel(
const uint32_t kv_h_stride = BLOCK_SIZE * HEAD_DIM / 2;
const uint32_t kv_b_stride = HEAD_DIM / 2;
const uint32_t kv_d_stride = BLOCK_SIZE / 2;
-const uint32_t q_start_seq_id =
-batch_id * max_seq_len - __ldg(&cum_offsets[batch_id]);
+const uint32_t q_start_seq_id = cu_seqlens_q[batch_id];
const uint32_t q_base_seq_id_this_block = tile_id * num_frags_x * 16;
const uint32_t q_offset = q_start_seq_id * q_ori_n_stride +
q_head_idx * HEAD_DIM +
@@ -962,8 +960,8 @@ void MultiQueryAppendC4Attention(
const paddle::Tensor &seq_lens_q,
const paddle::Tensor &seq_lens_kv,
const paddle::Tensor &seq_lens_encoder,
-const paddle::Tensor &padding_offsets,
-const paddle::Tensor &cum_offsets,
+const paddle::Tensor &batch_id_per_token,
+const paddle::Tensor &cu_seqlens_q,
const paddle::Tensor &block_table,
const paddle::Tensor &batch_ids,
const paddle::Tensor &tile_ids_per_batch,
@@ -1088,7 +1086,7 @@ void MultiQueryAppendC4Attention(
seq_lens_kv.data<int>(),
batch_ids.data<int>(),
tile_ids_per_batch.data<int>(),
-cum_offsets.data<int>(),
+cu_seqlens_q.data<int>(),
block_table.data<int>(),
max_seq_len,
max_dec_len,
@@ -1151,7 +1149,7 @@ void MultiQueryAppendC4Attention(
seq_lens_kv.data<int>(),
batch_ids.data<int>(),
tile_ids_per_batch.data<int>(),
-cum_offsets.data<int>(),
+cu_seqlens_q.data<int>(),
block_table.data<int>(),
max_seq_len,
max_dec_len,
@@ -1186,7 +1184,7 @@ void MultiQueryAppendC4Attention(
seq_lens_q.data<int>(),
seq_lens_kv.data<int>(),
seq_lens_encoder.data<int>(),
-cum_offsets.data<int>(),
+cu_seqlens_q.data<int>(),
shift_bias ? reinterpret_cast<NV_TYPE *>(
const_cast<T *>(shift_bias.get().data<T>()))
: nullptr,
@@ -1221,7 +1219,8 @@ void MultiQueryAppendC4Attention(
seq_lens_q.data<int>(),
seq_lens_kv.data<int>(),
seq_lens_encoder.data<int>(),
-padding_offsets.data<int>(),
+batch_id_per_token.data<int>(),
+cu_seqlens_q.data<int>(),
shift_bias ? reinterpret_cast<NV_TYPE *>(
const_cast<T *>(shift_bias.get().data<T>()))
: nullptr,
@@ -1333,7 +1332,7 @@ void MultiQueryAppendC4Attention(
seq_lens_kv.data<int>(),
batch_ids.data<int>(),
tile_ids_per_batch.data<int>(),
-cum_offsets.data<int>(),
+cu_seqlens_q.data<int>(),
block_table.data<int>(),
max_seq_len,
max_dec_len,
@@ -1409,7 +1408,7 @@ void MultiQueryAppendC4Attention(
seq_lens_kv.data<int>(),
batch_ids.data<int>(),
tile_ids_per_batch.data<int>(),
-cum_offsets.data<int>(),
+cu_seqlens_q.data<int>(),
block_table.data<int>(),
max_seq_len,
max_dec_len,
@@ -1444,7 +1443,7 @@ void MultiQueryAppendC4Attention(
seq_lens_q.data<int>(),
seq_lens_kv.data<int>(),
seq_lens_encoder.data<int>(),
-cum_offsets.data<int>(),
+cu_seqlens_q.data<int>(),
shift_bias ? reinterpret_cast<NV_TYPE *>(
const_cast<T *>(shift_bias.get().data<T>()))
: nullptr,
@@ -1479,7 +1478,8 @@ void MultiQueryAppendC4Attention(
seq_lens_q.data<int>(),
seq_lens_kv.data<int>(),
seq_lens_encoder.data<int>(),
-padding_offsets.data<int>(),
+batch_id_per_token.data<int>(),
+cu_seqlens_q.data<int>(),
shift_bias ? reinterpret_cast<NV_TYPE *>(
const_cast<T *>(shift_bias.get().data<T>()))
: nullptr,
@@ -1526,8 +1526,8 @@ void CascadeAppendAttentionC4Kernel(
const paddle::Tensor& seq_lens_q,
const paddle::Tensor& seq_lens_kv,
const paddle::Tensor& seq_lens_encoder,
-const paddle::Tensor& padding_offsets,
-const paddle::Tensor& cum_offsets,
+const paddle::Tensor& batch_id_per_token,
+const paddle::Tensor& cu_seqlens_q,
const paddle::Tensor& block_table,
const paddle::Tensor& batch_ids,
const paddle::Tensor& tile_ids_per_batch,
@@ -1593,8 +1593,8 @@ void CascadeAppendAttentionC4Kernel(
seq_lens_q,
seq_lens_kv,
seq_lens_encoder,
-padding_offsets,
-cum_offsets,
+batch_id_per_token,
+cu_seqlens_q,
block_table,
batch_ids,
tile_ids_per_batch,


@@ -46,7 +46,7 @@ __global__ void multi_query_append_attention_c8_kernel(
const int *__restrict__ seq_lens_kv,
const int *__restrict__ batch_ids,
const int *__restrict__ tile_ids_per_batch,
-const int *__restrict__ cum_offsets,
+const int *__restrict__ cu_seqlens_q,
const int *__restrict__ block_table, // [bsz, block_num_per_seq]
const int max_seq_len,
const int max_dec_len,
@@ -151,8 +151,7 @@ __global__ void multi_query_append_attention_c8_kernel(
const uint32_t kv_h_stride = BLOCK_SIZE * HEAD_DIM;
const uint32_t kv_b_stride = HEAD_DIM;
const uint32_t kv_d_stride = BLOCK_SIZE;
-const uint32_t q_start_seq_id =
-batch_id * max_seq_len - __ldg(&cum_offsets[batch_id]);
+const uint32_t q_start_seq_id = cu_seqlens_q[batch_id];
const uint32_t q_base_seq_id_this_block =
(tile_id * NUM_WARPS + wid) * num_frags_x * 16;
const uint32_t q_offset = q_start_seq_id * q_ori_n_stride +
@@ -473,7 +472,7 @@ __global__ void multi_query_append_attention_c8_warp1_4_kernel(
const int *__restrict__ seq_lens_kv,
const int *__restrict__ batch_ids,
const int *__restrict__ tile_ids_per_batch,
-const int *__restrict__ cum_offsets,
+const int *__restrict__ cu_seqlens_q,
const int *__restrict__ block_table, // [bsz, block_num_per_seq]
const int max_seq_len,
const int max_dec_len,
@@ -575,8 +574,7 @@ __global__ void multi_query_append_attention_c8_warp1_4_kernel(
const uint32_t kv_h_stride = BLOCK_SIZE * HEAD_DIM;
const uint32_t kv_b_stride = HEAD_DIM;
const uint32_t kv_d_stride = BLOCK_SIZE;
-const uint32_t q_start_seq_id =
-batch_id * max_seq_len - __ldg(&cum_offsets[batch_id]);
+const uint32_t q_start_seq_id = cu_seqlens_q[batch_id];
const uint32_t q_base_seq_id_this_block = tile_id * num_frags_x * 16;
const uint32_t q_offset = q_start_seq_id * q_ori_n_stride +
q_head_idx * HEAD_DIM +
@@ -899,8 +897,8 @@ void MultiQueryAppendC8Attention(
const paddle::Tensor &seq_lens_q,
const paddle::Tensor &seq_lens_kv,
const paddle::Tensor &seq_lens_encoder,
-const paddle::Tensor &padding_offsets,
-const paddle::Tensor &cum_offsets,
+const paddle::Tensor &batch_id_per_token,
+const paddle::Tensor &cu_seqlens_q,
const paddle::Tensor &block_table,
const paddle::Tensor &batch_ids,
const paddle::Tensor &tile_ids_per_batch,
@@ -1054,7 +1052,7 @@ void MultiQueryAppendC8Attention(
seq_lens_kv.data<int>(),
batch_ids.data<int>(),
tile_ids_per_batch.data<int>(),
-cum_offsets.data<int>(),
+cu_seqlens_q.data<int>(),
block_table.data<int>(),
max_seq_len,
max_dec_len,
@@ -1111,7 +1109,7 @@ void MultiQueryAppendC8Attention(
seq_lens_kv.data<int>(),
batch_ids.data<int>(),
tile_ids_per_batch.data<int>(),
-cum_offsets.data<int>(),
+cu_seqlens_q.data<int>(),
block_table.data<int>(),
max_seq_len,
max_dec_len,
@@ -1146,7 +1144,7 @@ void MultiQueryAppendC8Attention(
seq_lens_q.data<int>(),
seq_lens_kv.data<int>(),
seq_lens_encoder.data<int>(),
-cum_offsets.data<int>(),
+cu_seqlens_q.data<int>(),
shift_bias ? reinterpret_cast<NV_TYPE *>(
const_cast<T *>(shift_bias.get().data<T>()))
: nullptr,
@@ -1181,7 +1179,8 @@ void MultiQueryAppendC8Attention(
seq_lens_q.data<int>(),
seq_lens_kv.data<int>(),
seq_lens_encoder.data<int>(),
-padding_offsets.data<int>(),
+batch_id_per_token.data<int>(),
+cu_seqlens_q.data<int>(),
shift_bias ? reinterpret_cast<NV_TYPE *>(
const_cast<T *>(shift_bias.get().data<T>()))
: nullptr,
@@ -1317,7 +1316,7 @@ void MultiQueryAppendC8Attention(
seq_lens_kv.data<int>(),
batch_ids.data<int>(),
tile_ids_per_batch.data<int>(),
-cum_offsets.data<int>(),
+cu_seqlens_q.data<int>(),
block_table.data<int>(),
max_seq_len,
max_dec_len,
@@ -1387,7 +1386,7 @@ void MultiQueryAppendC8Attention(
seq_lens_kv.data<int>(),
batch_ids.data<int>(),
tile_ids_per_batch.data<int>(),
-cum_offsets.data<int>(),
+cu_seqlens_q.data<int>(),
block_table.data<int>(),
max_seq_len,
max_dec_len,
@@ -1417,7 +1416,7 @@ void MultiQueryAppendC8Attention(
seq_lens_q.data<int>(),
seq_lens_kv.data<int>(),
seq_lens_encoder.data<int>(),
-cum_offsets.data<int>(),
+cu_seqlens_q.data<int>(),
shift_bias ? reinterpret_cast<NV_TYPE *>(
const_cast<T *>(shift_bias.get().data<T>()))
: nullptr,
@@ -1452,7 +1451,8 @@ void MultiQueryAppendC8Attention(
seq_lens_q.data<int>(),
seq_lens_kv.data<int>(),
seq_lens_encoder.data<int>(),
-padding_offsets.data<int>(),
+batch_id_per_token.data<int>(),
+cu_seqlens_q.data<int>(),
shift_bias ? reinterpret_cast<NV_TYPE *>(
const_cast<T *>(shift_bias.get().data<T>()))
: nullptr,
@@ -1499,8 +1499,8 @@ void CascadeAppendAttentionC8Kernel(
const paddle::Tensor& seq_lens_q,
const paddle::Tensor& seq_lens_kv,
const paddle::Tensor& seq_lens_encoder,
-const paddle::Tensor& padding_offsets,
-const paddle::Tensor& cum_offsets,
+const paddle::Tensor& batch_id_per_token,
+const paddle::Tensor& cu_seqlens_q,
const paddle::Tensor& block_table,
const paddle::Tensor& batch_ids,
const paddle::Tensor& tile_ids_per_batch,
@@ -1564,8 +1564,8 @@ void CascadeAppendAttentionC8Kernel(
seq_lens_q,
seq_lens_kv,
seq_lens_encoder,
-padding_offsets,
-cum_offsets,
+batch_id_per_token,
+cu_seqlens_q,
block_table,
batch_ids,
tile_ids_per_batch,


@@ -1852,7 +1852,7 @@ __global__ void merge_multi_chunks_kernel(
const float* __restrict__ multi_d, // [token_num, num_chunks, num_heads]
const int* __restrict__ seq_lens_q,
const int* __restrict__ seq_lens_kv,
-const int* __restrict__ padding_offsets,
+const int* __restrict__ batch_id_per_token,
const T* __restrict__ shift_bias, // [q_num_heads * HEAD_DIM]
const T* __restrict__ smooth_weight, // [q_num_heads * HEAD_DIM]
T* __restrict__ out,
@@ -1866,8 +1866,7 @@ __global__ void merge_multi_chunks_kernel(
const int head_dim) {
const int vid = threadIdx.x, hid = threadIdx.y;
const int qid = blockIdx.x;
-const uint32_t ori_token_id = qid + padding_offsets[qid];
-const uint32_t bid = ori_token_id / max_seq_len;
+const uint32_t bid = batch_id_per_token[qid];
if (seq_lens_q[bid] <= 0 || seq_lens_kv[bid] <= 0) {
return;
}
@@ -2111,7 +2110,7 @@ __global__ void merge_multi_chunks_decoder_kernel(
const int *__restrict__ seq_lens_q,
const int *__restrict__ seq_lens_kv,
const int *__restrict__ seq_lens_encoder,
-const int *__restrict__ cum_offsets,
+const int *__restrict__ cu_seqlens_q,
const T *__restrict__ shift_bias, // [q_num_heads * HEAD_DIM]
const T *__restrict__ smooth_weight, // [q_num_heads * HEAD_DIM]
OutT *__restrict__ out,
@@ -2127,7 +2126,7 @@ __global__ void merge_multi_chunks_decoder_kernel(
const int bid = blockIdx.x, hid = blockIdx.y;
__shared__ T smem[bdy * HEAD_DIM];
__shared__ float md_smem[bdy * 2];
-const int start_token_idx = bid * max_seq_len - cum_offsets[bid];
+const int start_token_idx = cu_seqlens_q[bid];
const int seq_len_q = seq_lens_q[bid];
if (seq_len_q == 0) return;
int seq_len_kv = seq_lens_kv[bid];
@@ -2240,7 +2239,8 @@ __global__ void merge_multi_chunks_v2_kernel(
const int *__restrict__ seq_lens_q,
const int *__restrict__ seq_lens_kv,
const int *__restrict__ seq_lens_encoder,
-const int *__restrict__ padding_offsets,
+const int *__restrict__ batch_id_per_token,
+const int *__restrict__ cu_seqlens_q,
const T *__restrict__ shift_bias, // [q_num_heads * HEAD_DIM]
const T *__restrict__ smooth_weight, // [q_num_heads * HEAD_DIM]
OutT *__restrict__ out,
@@ -2259,9 +2259,8 @@ __global__ void merge_multi_chunks_v2_kernel(
__shared__ T smem[bdy * HEAD_DIM];
__shared__ float md_smem[bdy * 2];
for (int qid = blockIdx.x; qid < token_num; qid += gridDim.x) {
-const uint32_t ori_token_id = qid + padding_offsets[qid];
-const uint32_t bid = ori_token_id / max_seq_len;
-const uint32_t local_seq_id = ori_token_id % max_seq_len;
+const uint32_t bid = batch_id_per_token[qid];
+const uint32_t local_seq_id = qid - cu_seqlens_q[bid];
const int seq_len_q = seq_lens_q[bid];
if (seq_len_q == 0) continue;
int seq_len_kv = seq_lens_kv[bid];
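The indexing change in these merge kernels: instead of reconstructing a padded position (qid + padding_offsets[qid]) and dividing by max_seq_len, each token now looks its batch up directly, and its position within the sequence is qid - cu_seqlens_q[bid]. An illustrative Python equivalent of the two lookup tables:

import itertools

seq_lens = [3, 1, 4]  # made-up per-batch query lengths
cu_seqlens_q = [0, *itertools.accumulate(seq_lens)]  # [0, 3, 4, 8]
batch_id_per_token = [b for b, n in enumerate(seq_lens) for _ in range(n)]

for qid in range(sum(seq_lens)):
    bid = batch_id_per_token[qid]
    local_seq_id = qid - cu_seqlens_q[bid]  # position inside sequence bid
    assert 0 <= local_seq_id < seq_lens[bid]
print(batch_id_per_token)  # [0, 0, 0, 1, 2, 2, 2, 2]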


@@ -40,8 +40,8 @@ void CascadeAppendAttentionC16Kernel(
const paddle::Tensor& seq_lens_q,
const paddle::Tensor& seq_lens_kv,
const paddle::Tensor& seq_lens_encoder,
-const paddle::Tensor& padding_offsets,
-const paddle::Tensor& cum_offsets,
+const paddle::Tensor& batch_id_per_token,
+const paddle::Tensor& cu_seqlens_q,
const paddle::Tensor& block_table,
const paddle::Tensor& batch_ids,
const paddle::Tensor& tile_ids_per_batch,
@@ -85,8 +85,8 @@ void CascadeAppendAttentionC8Kernel(
const paddle::Tensor& seq_lens_q,
const paddle::Tensor& seq_lens_kv,
const paddle::Tensor& seq_lens_encoder,
-const paddle::Tensor& padding_offsets,
-const paddle::Tensor& cum_offsets,
+const paddle::Tensor& batch_id_per_token,
+const paddle::Tensor& cu_seqlens_q,
const paddle::Tensor& block_table,
const paddle::Tensor& batch_ids,
const paddle::Tensor& tile_ids_per_batch,
@@ -130,8 +130,8 @@ void CascadeAppendAttentionC4Kernel(
const paddle::Tensor& seq_lens_q,
const paddle::Tensor& seq_lens_kv,
const paddle::Tensor& seq_lens_encoder,
-const paddle::Tensor& padding_offsets,
-const paddle::Tensor& cum_offsets,
+const paddle::Tensor& batch_id_per_token,
+const paddle::Tensor& cu_seqlens_q,
const paddle::Tensor& block_table,
const paddle::Tensor& batch_ids,
const paddle::Tensor& tile_ids_per_batch,
@@ -175,8 +175,8 @@ void CascadeAppendAttentionKernel(
const paddle::Tensor& seq_lens_q,
const paddle::Tensor& seq_lens_kv,
const paddle::Tensor& seq_lens_encoder,
-const paddle::Tensor& padding_offsets,
-const paddle::Tensor& cum_offsets,
+const paddle::Tensor& batch_id_per_token,
+const paddle::Tensor& cu_seqlens_q,
const paddle::Tensor& block_table,
const paddle::Tensor& batch_ids,
const paddle::Tensor& tile_ids_per_batch,
@@ -211,8 +211,8 @@ void CascadeAppendAttentionKernel(
seq_lens_q,
seq_lens_kv,
seq_lens_encoder,
-padding_offsets,
-cum_offsets,
+batch_id_per_token,
+cu_seqlens_q,
block_table,
batch_ids,
tile_ids_per_batch,
@@ -246,8 +246,8 @@ void CascadeAppendAttentionKernel(
seq_lens_q,
seq_lens_kv,
seq_lens_encoder,
-padding_offsets,
-cum_offsets,
+batch_id_per_token,
+cu_seqlens_q,
block_table,
batch_ids,
tile_ids_per_batch,
@@ -281,8 +281,8 @@ void CascadeAppendAttentionKernel(
seq_lens_q,
seq_lens_kv,
seq_lens_encoder,
-padding_offsets,
-cum_offsets,
+batch_id_per_token,
+cu_seqlens_q,
block_table,
batch_ids,
tile_ids_per_batch,
@@ -316,8 +316,8 @@ void CascadeAppendAttentionKernel(
seq_lens_q,
seq_lens_kv,
seq_lens_encoder,
-padding_offsets,
-cum_offsets,
+batch_id_per_token,
+cu_seqlens_q,
block_table,
batch_ids,
tile_ids_per_batch,


@@ -35,7 +35,7 @@ __global__ void merge_varlen_multi_chunks_v2_kernel(const T * __restrict__ multi
const T * __restrict__ multi_d, // [bsz, num_chunks, num_heads]
const int * __restrict__ seq_lens_q,
const int * __restrict__ seq_lens_kv,
-const int * __restrict__ cum_offsets,
+const int * __restrict__ cu_seqlens_q,
const T * __restrict__ shift_bias, // [q_num_heads * HEAD_DIM]
const T * __restrict__ smooth_weight, // [q_num_heads * HEAD_DIM]
OutT * __restrict__ out, // [token_num, num_heads, head_dim]
@@ -59,7 +59,7 @@ __global__ void merge_varlen_multi_chunks_v2_kernel(const T * __restrict__ multi
__shared__ T smem[bdy * HEAD_DIM];
__shared__ T md_smem[bdy * 2];
-const int start_token_ids = qid * max_seq_len - __ldg(&cum_offsets[qid]);
+const int start_token_ids = cu_seqlens_q[qid];
using LoadT = AlignedVector<T, vec_size>;
LoadT load_vec;
LoadT res_vec;
@@ -134,7 +134,7 @@ __global__ void multi_query_decode_attention_kernel(T * __restrict__ q, // [toke
const T * __restrict__ smooth_weight, // [q_num_heads * HEAD_DIM]
const int * __restrict__ seq_lens_q,
const int * __restrict__ seq_lens_kv,
-const int * __restrict__ cum_offsets,
+const int * __restrict__ cu_seqlens_q,
const int * __restrict__ block_table, // [bsz, block_num_per_seq]
const int max_seq_len,
const int max_dec_len,
@@ -171,8 +171,8 @@ __global__ void multi_query_decode_attention_kernel(T * __restrict__ q, // [toke
}
kv_len += q_len;
const uint32_t num_chunk_this_seq = div_up(kv_len, chunk_size);
-const uint32_t q_start_idx = bid * max_seq_len - __ldg(&cum_offsets[bid]);
-const uint32_t q_write_idx = bid * max_seq_len - __ldg(&cum_offsets[bid]);
+const uint32_t q_start_idx = cu_seqlens_q[bid];
+const uint32_t q_write_idx = cu_seqlens_q[bid];
if (chunk_id >= num_chunk_this_seq) {
return;
}
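Context for the chunked decode path above: the multi_m/multi_d buffers appear to hold per-chunk softmax statistics (running max m and denominator d), so partial attention outputs computed over KV chunks can be merged exactly afterwards. A hedged NumPy sketch of that merge rule, checked against a single-pass softmax:

import numpy as np

def merge(o1, m1, d1, o2, m2, d2):
    m = max(m1, m2)                                  # new running max of logits
    d = d1 * np.exp(m1 - m) + d2 * np.exp(m2 - m)    # rescaled denominator
    o = (o1 * d1 * np.exp(m1 - m) + o2 * d2 * np.exp(m2 - m)) / d
    return o, m, d

def partial(l, x):
    m = l.max()
    w = np.exp(l - m)
    return (w @ x) / w.sum(), m, w.sum()  # chunk output plus (m, d) stats

rng = np.random.default_rng(0)
logits = rng.normal(size=8)
v = rng.normal(size=(8, 4))
w_full = np.exp(logits - logits.max())
full = (w_full / w_full.sum()) @ v  # reference: softmax over all logits at once

o1, m1, d1 = partial(logits[:5], v[:5])
o2, m2, d2 = partial(logits[5:], v[5:])
merged, _, _ = merge(o1, m1, d1, o2, m2, d2)
assert np.allclose(merged, full)
print("chunked merge matches full softmax")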
@@ -317,8 +317,8 @@ void MultiQueryDecoderAttention(
const paddle::optional<paddle::Tensor>& smooth_weight,
const paddle::Tensor &seq_lens_q,
const paddle::Tensor &seq_lens_kv,
-const paddle::Tensor &padding_offsets,
-const paddle::Tensor &cum_offsets,
+const paddle::Tensor &batch_id_per_token,
+const paddle::Tensor &cu_seqlens_q,
const paddle::Tensor &block_table,
const int max_seq_len,
const int max_dec_len,
@@ -393,7 +393,7 @@ void MultiQueryDecoderAttention(
reinterpret_cast<NV_TYPE*>(const_cast<T*>(smooth_weight_ptr)),
seq_lens_q.data<int>(),
seq_lens_kv.data<int>(),
-cum_offsets.data<int>(),
+cu_seqlens_q.data<int>(),
block_table.data<int>(),
max_seq_len,
max_dec_len,
@@ -430,7 +430,7 @@ void MultiQueryDecoderAttention(
reinterpret_cast<NV_TYPE*>(const_cast<T*>(smooth_weight_ptr)),
seq_lens_q.data<int>(),
seq_lens_kv.data<int>(),
-cum_offsets.data<int>(),
+cu_seqlens_q.data<int>(),
block_table.data<int>(),
max_seq_len,
max_dec_len,
@@ -456,7 +456,7 @@ void MultiQueryDecoderAttention(
reinterpret_cast<NV_TYPE*>(tmp_d->ptr()),
seq_lens_q.data<int>(),
seq_lens_kv.data<int>(),
-cum_offsets.data<int>(),
+cu_seqlens_q.data<int>(),
reinterpret_cast<NV_TYPE*>(const_cast<T*>(shift_bias_ptr)),
reinterpret_cast<NV_TYPE*>(const_cast<T*>(smooth_weight_ptr)),
reinterpret_cast<NV_TYPE*>(const_cast<T*>(out->data<T>())),
@@ -483,8 +483,8 @@ void DecodeMLAAttentionKernel(
const paddle::optional<paddle::Tensor>& smooth_weight,
const paddle::Tensor &seq_lens_q, // q_seq_len is 1
const paddle::Tensor &seq_lens_kv,
-const paddle::Tensor &padding_offsets,
-const paddle::Tensor &cum_offsets,
+const paddle::Tensor &batch_id_per_token,
+const paddle::Tensor &cu_seqlens_q,
const paddle::Tensor &block_table,
int max_seq_len,
int max_dec_len,
@@ -513,7 +513,7 @@ void DecodeMLAAttentionKernel(
{DISPATCH_BLOCK_SIZE(block_size, BLOCK_SIZE,
{DISPATCH_DEAL_EACH_TIME(deal_each_time, DEAL_EACH_TIME,
{MultiQueryDecoderAttention<T, GROUP_SIZE, HEAD_DIM_QK, HEAD_DIM_V, BLOCK_SIZE, CAUSAL, 2, 16, DEAL_EACH_TIME>(
-meta_data, stream, q, cache_k, cache_v, attn_mask, shift_bias, smooth_weight, seq_lens_q, seq_lens_kv, padding_offsets, cum_offsets,
+meta_data, stream, q, cache_k, cache_v, attn_mask, shift_bias, smooth_weight, seq_lens_q, seq_lens_kv, batch_id_per_token, cu_seqlens_q,
block_table, max_seq_len, max_dec_len, rope_scale, rope_theta, softmax_scale, in_scale, out);})})})})})});
}
@@ -527,8 +527,8 @@ template void DecodeMLAAttentionKernel<paddle::bfloat16>(
const paddle::optional<paddle::Tensor>& smooth_weight,
const paddle::Tensor &seq_lens_q, // q_seq_len is 1
const paddle::Tensor &seq_lens_kv,
-const paddle::Tensor &padding_offsets,
-const paddle::Tensor &cum_offsets,
+const paddle::Tensor &batch_id_per_token,
+const paddle::Tensor &cu_seqlens_q,
const paddle::Tensor &block_table,
int max_seq_len,
int max_dec_len,
@@ -548,8 +548,8 @@ template void DecodeMLAAttentionKernel<paddle::float16>(
const paddle::optional<paddle::Tensor>& smooth_weight,
const paddle::Tensor &seq_lens_q, // q_seq_len is 1
const paddle::Tensor &seq_lens_kv,
-const paddle::Tensor &padding_offsets,
-const paddle::Tensor &cum_offsets,
+const paddle::Tensor &batch_id_per_token,
+const paddle::Tensor &cu_seqlens_q,
const paddle::Tensor &block_table,
int max_seq_len,
int max_dec_len,


@@ -28,8 +28,8 @@ __global__ void append_decode_cache_T_rope_kernel(
// head_size // 2]
T* __restrict__ qkv_out,
const int* __restrict__ block_tables, // [bsz, max_blocks_per_seq]
-const int* __restrict__ padding_offsets, // [num_tokens]
-const int* __restrict__ cum_offsets,
+const int* __restrict__ batch_id_per_token, // [num_tokens]
+const int* __restrict__ cu_seqlens_q,
const int* __restrict__ seq_lens, // [bsz]
const int* __restrict__ seq_lens_encoder, // [bsz]
const float* __restrict__ cos_emb,
@@ -65,7 +65,7 @@ __global__ void append_decode_cache_T_rope_kernel(
const int bias = linear_index % hidden_size;
const int hi = bias / head_size; // q + k + v
const int h_bias = bias % head_size;
-const int start_token_idx = ori_bi * max_seq_len - cum_offsets[ori_bi];
+const int start_token_idx = cu_seqlens_q[ori_bi];
if (seq_lens_encoder[ori_bi] > 0) return;
const int write_seq_id = seq_lens[ori_bi];
if (write_seq_id == 0) continue;
@@ -134,8 +134,8 @@ __global__ void append_decode_cache_T_rope_kernel(
// head_size // 2]
T* __restrict__ qkv_out,
const int* __restrict__ block_tables, // [bsz, max_blocks_per_seq]
-const int* __restrict__ padding_offsets, // [num_tokens]
-const int* __restrict__ cum_offsets,
+const int* __restrict__ batch_id_per_token, // [num_tokens]
+const int* __restrict__ cu_seqlens_q,
const int* __restrict__ seq_lens, // [bsz]
const int* __restrict__ seq_lens_encoder, // [bsz]
const float* __restrict__ cos_emb,
@@ -177,7 +177,7 @@ __global__ void append_decode_cache_T_rope_kernel(
const int bias = linear_index % hidden_size;
const int hi = bias / head_size; // q + k + v
const int h_bias = bias % head_size;
-const int start_token_idx = ori_bi * max_seq_len - cum_offsets[ori_bi];
+const int start_token_idx = cu_seqlens_q[ori_bi];
if (seq_lens_encoder[ori_bi] > 0) return;
const int write_seq_id = seq_lens[ori_bi];
if (write_seq_id == 0) continue;
@@ -254,8 +254,8 @@ __global__ void append_decode_cache_T_neox_rope_kernel(
// head_size // 2]
T* __restrict__ qkv_out,
const int* __restrict__ block_tables, // [bsz, max_blocks_per_seq]
-const int* __restrict__ padding_offsets, // [num_tokens]
-const int* __restrict__ cum_offsets,
+const int* __restrict__ batch_id_per_token, // [num_tokens]
+const int* __restrict__ cu_seqlens_q,
const int* __restrict__ seq_lens, // [bsz]
const int* __restrict__ seq_lens_encoder, // [bsz]
const float* __restrict__ cos_emb,
@@ -293,7 +293,7 @@ __global__ void append_decode_cache_T_neox_rope_kernel(
const int bias = linear_index % half_hidden_size;
const int hi = bias / half_head_size; // q + k + v
const int h_bias = bias % half_head_size;
-const int start_token_idx = ori_bi * max_seq_len - cum_offsets[ori_bi];
+const int start_token_idx = cu_seqlens_q[ori_bi];
if (seq_lens_encoder[ori_bi] > 0) return;
const int write_seq_id = seq_lens[ori_bi];
if (write_seq_id == 0) continue;
@@ -366,8 +366,8 @@ __global__ void append_decode_cache_T_neox_rope_kernel(
// head_size // 2]
T* __restrict__ qkv_out,
const int* __restrict__ block_tables, // [bsz, max_blocks_per_seq]
-const int* __restrict__ padding_offsets, // [num_tokens]
-const int* __restrict__ cum_offsets,
+const int* __restrict__ batch_id_per_token, // [num_tokens]
+const int* __restrict__ cu_seqlens_q,
const int* __restrict__ seq_lens, // [bsz]
const int* __restrict__ seq_lens_encoder, // [bsz]
const float* __restrict__ cos_emb,
@@ -409,7 +409,7 @@ __global__ void append_decode_cache_T_neox_rope_kernel(
const int bias = linear_index % half_hidden_size;
const int hi = bias / half_head_size; // q + k + v
const int h_bias = bias % half_head_size;
-const int start_token_idx = ori_bi * max_seq_len - cum_offsets[ori_bi];
+const int start_token_idx = cu_seqlens_q[ori_bi];
if (seq_lens_encoder[ori_bi] > 0) return;
const int write_seq_id = seq_lens[ori_bi];
if (write_seq_id == 0) continue;
@@ -498,8 +498,8 @@ __global__ void append_decode_cache_int8_rope_kernel(
// block_size, head_size // 2]
T* __restrict__ qkv_out,
const int* __restrict__ block_tables, // [bsz, max_blocks_per_seq]
-const int* __restrict__ padding_offsets, // [num_tokens]
-const int* __restrict__ cum_offsets,
+const int* __restrict__ batch_id_per_token, // [num_tokens]
+const int* __restrict__ cu_seqlens_q,
const int* __restrict__ seq_lens, // [bsz]
const int* __restrict__ seq_lens_encoder, // [bsz]
const float* __restrict__ cos_emb,
@@ -523,7 +523,7 @@ __global__ void append_decode_cache_int8_rope_kernel(
int q_head_idx, k_head_idx, v_idx;
const int64_t hidden_size = (num_heads + 2 * kv_num_heads) * HeadDim;
constexpr int half_head_size = HeadDim / 2;
-const int start_token_idx = bid * max_seq_len - __ldg(&cum_offsets[bid]);
+const int start_token_idx = cu_seqlens_q[bid];
if (seq_lens_encoder[bid] > 0) return;
const int write_seq_id = seq_lens[bid];
if (write_seq_id == 0) return;
@@ -745,8 +745,8 @@ __global__ void append_decode_cache_int8_rope_kernel(
// block_size, head_size // 2]
T* __restrict__ qkv_out,
const int* __restrict__ block_tables, // [bsz, max_blocks_per_seq]
-const int* __restrict__ padding_offsets, // [num_tokens]
-const int* __restrict__ cum_offsets,
+const int* __restrict__ batch_id_per_token, // [num_tokens]
+const int* __restrict__ cu_seqlens_q,
const int* __restrict__ seq_lens, // [bsz]
const int* __restrict__ seq_lens_encoder, // [bsz]
const float* __restrict__ cos_emb,
@@ -775,7 +775,7 @@ __global__ void append_decode_cache_int8_rope_kernel(
int q_head_idx, k_head_idx, v_idx;
const int64_t hidden_size = (num_heads + 2 * kv_num_heads) * HeadDim;
constexpr int half_head_size = HeadDim / 2;
-const int start_token_idx = bid * max_seq_len - __ldg(&cum_offsets[bid]);
+const int start_token_idx = cu_seqlens_q[bid];
if (seq_lens_encoder[bid] > 0) return;
const int write_seq_id = seq_lens[bid];
if (write_seq_id == 0) return;
@@ -1047,8 +1047,8 @@ __global__ void append_decode_cache_int8_neox_rope_kernel(
// block_size, head_size // 2]
T* __restrict__ qkv_out,
const int* __restrict__ block_tables, // [bsz, max_blocks_per_seq]
-const int* __restrict__ padding_offsets, // [num_tokens]
-const int* __restrict__ cum_offsets,
+const int* __restrict__ batch_id_per_token, // [num_tokens]
+const int* __restrict__ cu_seqlens_q,
const int* __restrict__ seq_lens, // [bsz]
const int* __restrict__ seq_lens_encoder, // [bsz]
const float* __restrict__ cos_emb,
@@ -1073,7 +1073,7 @@ __global__ void append_decode_cache_int8_neox_rope_kernel(
int q_head_idx, k_head_idx, v_idx;
const int64_t hidden_size = (num_heads + 2 * kv_num_heads) * HeadDim;
constexpr int half_head_size = HeadDim / 2;
-const int start_token_idx = bid * max_seq_len - __ldg(&cum_offsets[bid]);
+const int start_token_idx = cu_seqlens_q[bid];
if (seq_lens_encoder[bid] > 0) return;
const int write_seq_id = seq_lens[bid];
if (write_seq_id == 0) return;
@@ -1346,8 +1346,8 @@ __global__ void append_decode_cache_int8_neox_rope_kernel(
// block_size, head_size // 2]
T* __restrict__ qkv_out,
const int* __restrict__ block_tables, // [bsz, max_blocks_per_seq]
-const int* __restrict__ padding_offsets, // [num_tokens]
-const int* __restrict__ cum_offsets,
+const int* __restrict__ batch_id_per_token, // [num_tokens]
+const int* __restrict__ cu_seqlens_q,
const int* __restrict__ seq_lens, // [bsz]
const int* __restrict__ seq_lens_encoder, // [bsz]
const float* __restrict__ cos_emb,
@@ -1377,7 +1377,7 @@ __global__ void append_decode_cache_int8_neox_rope_kernel(
const int64_t hidden_size = (num_heads + 2 * kv_num_heads) * HeadDim;
constexpr int half_head_size = HeadDim / 2;
-const int start_token_idx = bid * max_seq_len - __ldg(&cum_offsets[bid]);
+const int start_token_idx = cu_seqlens_q[bid];
if (seq_lens_encoder[bid] > 0) return;
const int write_seq_id = seq_lens[bid];
if (write_seq_id == 0) return;
@@ -1739,8 +1739,8 @@ __global__ void append_decode_cache_int4_rope_kernel(
// block_size, head_size // 2]
T* __restrict__ qkv_out,
const int* __restrict__ block_tables, // [bsz, max_blocks_per_seq]
-const int* __restrict__ padding_offsets, // [num_tokens]
-const int* __restrict__ cum_offsets,
+const int* __restrict__ batch_id_per_token, // [num_tokens]
+const int* __restrict__ cu_seqlens_q,
const int* __restrict__ seq_lens, // [bsz]
const int* __restrict__ seq_lens_encoder, // [bsz]
const float* __restrict__ cos_emb,
@@ -1766,7 +1766,7 @@ __global__ void append_decode_cache_int4_rope_kernel(
const int64_t hidden_size = (num_heads + 2 * kv_num_heads) * HeadDim;
constexpr int half_head_size = HeadDim / 2;
const int half_block_size = block_size / 2;
-const int start_token_idx = bid * max_seq_len - __ldg(&cum_offsets[bid]);
+const int start_token_idx = cu_seqlens_q[bid];
if (seq_lens_encoder[bid] > 0) return;
const int write_seq_id = seq_lens[bid];
if (write_seq_id == 0) return;
@@ -2034,8 +2034,8 @@ __global__ void append_decode_cache_int4_rope_kernel(
// block_size, head_size // 2]
T* __restrict__ qkv_out,
const int* __restrict__ block_tables, // [bsz, max_blocks_per_seq]
const int* __restrict__ padding_offsets, // [num_tokens]
const int* __restrict__ cum_offsets,
const int* __restrict__ batch_id_per_token, // [num_tokens]
const int* __restrict__ cu_seqlens_q,
const int* __restrict__ seq_lens, // [bsz]
const int* __restrict__ seq_lens_encoder, // [bsz]
const float* __restrict__ cos_emb,
@@ -2066,7 +2066,7 @@ __global__ void append_decode_cache_int4_rope_kernel(
const int64_t hidden_size = (num_heads + 2 * kv_num_heads) * HeadDim;
constexpr int half_head_size = HeadDim / 2;
const int half_block_size = block_size / 2;
const int start_token_idx = bid * max_seq_len - __ldg(&cum_offsets[bid]);
const int start_token_idx = cu_seqlens_q[bid];
if (seq_lens_encoder[bid] > 0) return;
const int write_seq_id = seq_lens[bid];
if (write_seq_id == 0) return;
@@ -2362,8 +2362,8 @@ __global__ void append_decode_cache_int4_neox_rope_kernel(
// block_size, head_size // 2]
T* __restrict__ qkv_out,
const int* __restrict__ block_tables, // [bsz, max_blocks_per_seq]
const int* __restrict__ padding_offsets, // [num_tokens]
const int* __restrict__ cum_offsets,
const int* __restrict__ batch_id_per_token, // [num_tokens]
const int* __restrict__ cu_seqlens_q,
const int* __restrict__ seq_lens, // [bsz]
const int* __restrict__ seq_lens_encoder, // [bsz]
const float* __restrict__ cos_emb,
@@ -2389,7 +2389,7 @@ __global__ void append_decode_cache_int4_neox_rope_kernel(
const int64_t hidden_size = (num_heads + 2 * kv_num_heads) * HeadDim;
constexpr int half_head_size = HeadDim / 2;
const int half_block_size = block_size / 2;
const int start_token_idx = bid * max_seq_len - __ldg(&cum_offsets[bid]);
const int start_token_idx = cu_seqlens_q[bid];
if (seq_lens_encoder[bid] > 0) return;
const int write_seq_id = seq_lens[bid];
if (write_seq_id == 0) return;
@@ -2732,8 +2732,8 @@ __global__ void append_decode_cache_int4_neox_rope_kernel(
// block_size, head_size // 2]
T* __restrict__ qkv_out,
const int* __restrict__ block_tables, // [bsz, max_blocks_per_seq]
const int* __restrict__ padding_offsets, // [num_tokens]
const int* __restrict__ cum_offsets,
const int* __restrict__ batch_id_per_token, // [num_tokens]
const int* __restrict__ cu_seqlens_q,
const int* __restrict__ seq_lens, // [bsz]
const int* __restrict__ seq_lens_encoder, // [bsz]
const float* __restrict__ cos_emb,
@@ -2764,7 +2764,7 @@ __global__ void append_decode_cache_int4_neox_rope_kernel(
const int64_t hidden_size = (num_heads + 2 * kv_num_heads) * HeadDim;
constexpr int half_head_size = HeadDim / 2;
const int half_block_size = block_size / 2;
const int start_token_idx = bid * max_seq_len - __ldg(&cum_offsets[bid]);
const int start_token_idx = cu_seqlens_q[bid];
if (seq_lens_encoder[bid] > 0) return;
const int write_seq_id = seq_lens[bid];
if (write_seq_id == 0) return;
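
Note: the recurring change across these decode-cache kernels replaces the derived offset bid * max_seq_len - cum_offsets[bid] with a direct lookup into cu_seqlens_q. A minimal host-side sketch of why the two agree, assuming cum_offsets[b] holds the cumulative padding removed before batch b in a [bsz, max_seq_len] padded layout (all values here are illustrative):

#include <cstdio>
#include <vector>

int main() {
  const int max_seq_len = 8;
  std::vector<int> seq_lens_q = {1, 1, 1};  // decode: one token per sequence
  std::vector<int> cum_offsets(seq_lens_q.size());
  std::vector<int> cu_seqlens_q(seq_lens_q.size() + 1, 0);

  int removed = 0;
  for (size_t b = 0; b < seq_lens_q.size(); ++b) {
    cum_offsets[b] = removed;  // padding removed before batch b
    removed += max_seq_len - seq_lens_q[b];
    cu_seqlens_q[b + 1] = cu_seqlens_q[b] + seq_lens_q[b];
  }
  for (size_t b = 0; b < seq_lens_q.size(); ++b) {
    int old_start = (int)b * max_seq_len - cum_offsets[b];  // removed form
    int new_start = cu_seqlens_q[b];                        // replacement
    printf("b=%zu old=%d new=%d\n", b, old_start, new_start);
  }
  return 0;
}

Besides being simpler, the lookup no longer depends on max_seq_len, so it keeps working when query tokens are packed without padding.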


@@ -21,8 +21,8 @@ void append_decode_cache_rope(const QKV_TYPE* qkv,
T* value_cache,
T* qkv_out,
const int* block_tables,
const int* padding_offsets,
const int* cum_offsets,
const int* batch_id_per_token,
const int* cu_seqlens_q,
const int* seq_lens,
const int* seq_lens_encoder,
const float* cos_emb,
@@ -57,8 +57,8 @@ void append_decode_cache_rope(const QKV_TYPE* qkv,
value_cache,
qkv_out,
block_tables,
padding_offsets,
cum_offsets,
batch_id_per_token,
cu_seqlens_q,
seq_lens,
seq_lens_encoder,
cos_emb,
@@ -79,8 +79,8 @@ void append_decode_cache_rope(const QKV_TYPE* qkv,
value_cache,
qkv_out,
block_tables,
padding_offsets,
cum_offsets,
batch_id_per_token,
cu_seqlens_q,
seq_lens,
seq_lens_encoder,
cos_emb,
@@ -102,8 +102,8 @@ void append_decode_cache_rope(const QKV_TYPE* qkv,
value_cache,
qkv_out,
block_tables,
padding_offsets,
cum_offsets,
batch_id_per_token,
cu_seqlens_q,
seq_lens,
seq_lens_encoder,
cos_emb,
@@ -125,8 +125,8 @@ void append_decode_cache_rope(const QKV_TYPE* qkv,
value_cache,
qkv_out,
block_tables,
padding_offsets,
cum_offsets,
batch_id_per_token,
cu_seqlens_q,
seq_lens,
seq_lens_encoder,
cos_emb,
@@ -149,8 +149,8 @@ void append_decode_cache_int8_rope(const QKV_TYPE* qkv,
uint8_t* value_cache,
T* qkv_out,
const int* block_tables,
const int* padding_offsets,
const int* cum_offsets,
const int* batch_id_per_token,
const int* cu_seqlens_q,
const int* seq_lens,
const int* seq_lens_encoder,
const float* cos_emb,
@@ -182,8 +182,8 @@ void append_decode_cache_int8_rope(const QKV_TYPE* qkv,
value_cache,
qkv_out,
block_tables,
padding_offsets,
cum_offsets,
batch_id_per_token,
cu_seqlens_q,
seq_lens,
seq_lens_encoder,
cos_emb,
@@ -207,8 +207,8 @@ void append_decode_cache_int8_rope(const QKV_TYPE* qkv,
value_cache,
qkv_out,
block_tables,
padding_offsets,
cum_offsets,
batch_id_per_token,
cu_seqlens_q,
seq_lens,
seq_lens_encoder,
cos_emb,
@@ -232,8 +232,8 @@ void append_decode_cache_int8_rope(const QKV_TYPE* qkv,
value_cache,
qkv_out,
block_tables,
padding_offsets,
cum_offsets,
batch_id_per_token,
cu_seqlens_q,
seq_lens,
seq_lens_encoder,
cos_emb,
@@ -257,8 +257,8 @@ void append_decode_cache_int8_rope(const QKV_TYPE* qkv,
value_cache,
qkv_out,
block_tables,
padding_offsets,
cum_offsets,
batch_id_per_token,
cu_seqlens_q,
seq_lens,
seq_lens_encoder,
cos_emb,
@@ -282,8 +282,8 @@ void append_decode_cache_int4_rope(const QKV_TYPE* qkv,
uint8_t* value_cache,
T* qkv_out,
const int* block_tables,
const int* padding_offsets,
const int* cum_offsets,
const int* batch_id_per_token,
const int* cu_seqlens_q,
const int* seq_lens,
const int* seq_lens_encoder,
const float* cos_emb,
@@ -317,8 +317,8 @@ void append_decode_cache_int4_rope(const QKV_TYPE* qkv,
value_cache,
qkv_out,
block_tables,
padding_offsets,
cum_offsets,
batch_id_per_token,
cu_seqlens_q,
seq_lens,
seq_lens_encoder,
cos_emb,
@@ -344,8 +344,8 @@ void append_decode_cache_int4_rope(const QKV_TYPE* qkv,
value_cache,
qkv_out,
block_tables,
padding_offsets,
cum_offsets,
batch_id_per_token,
cu_seqlens_q,
seq_lens,
seq_lens_encoder,
cos_emb,
@@ -371,8 +371,8 @@ void append_decode_cache_int4_rope(const QKV_TYPE* qkv,
value_cache,
qkv_out,
block_tables,
padding_offsets,
cum_offsets,
batch_id_per_token,
cu_seqlens_q,
seq_lens,
seq_lens_encoder,
cos_emb,
@@ -398,8 +398,8 @@ void append_decode_cache_int4_rope(const QKV_TYPE* qkv,
value_cache,
qkv_out,
block_tables,
padding_offsets,
cum_offsets,
batch_id_per_token,
cu_seqlens_q,
seq_lens,
seq_lens_encoder,
cos_emb,
@@ -424,8 +424,8 @@ void DecoderWriteCacheWithRoPEKernel(
const paddle::Tensor& qkv,
const paddle::Tensor& seq_lens,
const paddle::Tensor& seq_lens_encoder,
const paddle::Tensor& padding_offsets,
const paddle::Tensor& cum_offsets,
const paddle::Tensor& batch_id_per_token,
const paddle::Tensor& cu_seqlens_q,
const paddle::Tensor& block_tables,
const paddle::optional<paddle::Tensor>& rotary_embs,
const paddle::optional<paddle::Tensor>& qkv_out_scales,
@@ -471,8 +471,8 @@ void DecoderWriteCacheWithRoPEKernel(
reinterpret_cast<DataType_*>(value_cache_out->data<T>()),
reinterpret_cast<DataType_*>(qkv_out->data<T>()),
block_tables.data<int>(),
padding_offsets.data<int>(),
cum_offsets.data<int>(),
batch_id_per_token.data<int>(),
cu_seqlens_q.data<int>(),
seq_lens.data<int>(),
seq_lens_encoder.data<int>(),
cos_emb,
@@ -503,8 +503,8 @@ void DecoderWriteCacheWithRoPEKernel(
value_cache_out->data<uint8_t>(),
reinterpret_cast<DataType_*>(qkv_out->data<T>()),
block_tables.data<int>(),
padding_offsets.data<int>(),
cum_offsets.data<int>(),
batch_id_per_token.data<int>(),
cu_seqlens_q.data<int>(),
seq_lens.data<int>(),
seq_lens_encoder.data<int>(),
cos_emb,
@@ -536,8 +536,8 @@ void DecoderWriteCacheWithRoPEKernel(
value_cache_out->data<uint8_t>(),
reinterpret_cast<DataType_*>(qkv_out->data<T>()),
block_tables.data<int>(),
padding_offsets.data<int>(),
cum_offsets.data<int>(),
batch_id_per_token.data<int>(),
cu_seqlens_q.data<int>(),
seq_lens.data<int>(),
seq_lens_encoder.data<int>(),
cos_emb,
@@ -570,8 +570,8 @@ void DecoderWriteCacheWithRoPEKernel(
value_cache_out->data<uint8_t>(),
reinterpret_cast<DataType_*>(qkv_out->data<T>()),
block_tables.data<int>(),
padding_offsets.data<int>(),
cum_offsets.data<int>(),
batch_id_per_token.data<int>(),
cu_seqlens_q.data<int>(),
seq_lens.data<int>(),
seq_lens_encoder.data<int>(),
cos_emb,
@@ -603,8 +603,8 @@ void DecoderWriteCacheWithRoPEKernel(
value_cache_out->data<uint8_t>(),
reinterpret_cast<DataType_*>(const_cast<T*>(qkv_out->data<T>())),
block_tables.data<int>(),
padding_offsets.data<int>(),
cum_offsets.data<int>(),
batch_id_per_token.data<int>(),
cu_seqlens_q.data<int>(),
seq_lens.data<int>(),
seq_lens_encoder.data<int>(),
cos_emb,
@@ -650,8 +650,8 @@ template void DecoderWriteCacheWithRoPEKernel<paddle::bfloat16, int>(
// kv_num_heads, head_dim] if GQA)
const paddle::Tensor& seq_lens,
const paddle::Tensor& seq_lens_encoder,
const paddle::Tensor& padding_offsets,
const paddle::Tensor& cum_offsets,
const paddle::Tensor& batch_id_per_token,
const paddle::Tensor& cu_seqlens_q,
const paddle::Tensor& block_tables,
const paddle::optional<paddle::Tensor>& rotary_embs,
const paddle::optional<paddle::Tensor>& qkv_out_scales,
@@ -677,8 +677,8 @@ DecoderWriteCacheWithRoPEKernel<paddle::bfloat16, paddle::bfloat16>(
// kv_num_heads, head_dim] if GQA)
const paddle::Tensor& seq_lens,
const paddle::Tensor& seq_lens_encoder,
const paddle::Tensor& padding_offsets,
const paddle::Tensor& cum_offsets,
const paddle::Tensor& batch_id_per_token,
const paddle::Tensor& cu_seqlens_q,
const paddle::Tensor& block_tables,
const paddle::optional<paddle::Tensor>& rotary_embs,
const paddle::optional<paddle::Tensor>& qkv_out_scales,
@@ -703,8 +703,8 @@ template void DecoderWriteCacheWithRoPEKernel<paddle::float16, int>(
// kv_num_heads, head_dim] if GQA)
const paddle::Tensor& seq_lens,
const paddle::Tensor& seq_lens_encoder,
const paddle::Tensor& padding_offsets,
const paddle::Tensor& cum_offsets,
const paddle::Tensor& batch_id_per_token,
const paddle::Tensor& cu_seqlens_q,
const paddle::Tensor& block_tables,
const paddle::optional<paddle::Tensor>& rotary_embs,
const paddle::optional<paddle::Tensor>& qkv_out_scales,
@@ -729,8 +729,8 @@ template void DecoderWriteCacheWithRoPEKernel<paddle::float16, paddle::float16>(
// kv_num_heads, head_dim] if GQA)
const paddle::Tensor& seq_lens,
const paddle::Tensor& seq_lens_encoder,
const paddle::Tensor& padding_offsets,
const paddle::Tensor& cum_offsets,
const paddle::Tensor& batch_id_per_token,
const paddle::Tensor& cu_seqlens_q,
const paddle::Tensor& block_tables,
const paddle::optional<paddle::Tensor>& rotary_embs,
const paddle::optional<paddle::Tensor>& qkv_out_scales,


@@ -23,8 +23,8 @@ void DecoderWriteCacheWithRoPEKernel(
// kv_num_heads, head_dim] if GQA)
const paddle::Tensor& seq_lens,
const paddle::Tensor& seq_lens_encoder,
const paddle::Tensor& padding_offsets,
const paddle::Tensor& cum_offsets,
const paddle::Tensor& batch_id_per_token,
const paddle::Tensor& cu_seqlens_q,
const paddle::Tensor& block_tables,
const paddle::optional<paddle::Tensor>& rotary_embs,
const paddle::optional<paddle::Tensor>& qkv_out_scales,
@@ -40,4 +40,4 @@ void DecoderWriteCacheWithRoPEKernel(
cudaStream_t& stream,
paddle::Tensor* qkv_out,
paddle::Tensor* key_cache_out,
paddle::Tensor* value_cache_out);
paddle::Tensor* value_cache_out);


@@ -23,7 +23,8 @@ __global__ void VariableLengthRotaryKernel(
const int *qkv,
const float *cos_emb, // [1, 1, seq_len, dim_head / 2]
const float *sin_emb,
const int *padding_offsets,
const int *batch_id_per_token,
const int *cu_seqlens_q,
const int *seq_lens,
const int *seq_lens_decoder,
const float *qkv_out_scales, // [3, num_head, dim_head]
@@ -52,8 +53,7 @@ __global__ void VariableLengthRotaryKernel(
linear_index < elem_cnt;
linear_index += step) {
const int token_idx = linear_index / offset;
const int ori_token_idx = token_idx + padding_offsets[token_idx];
const int ori_bi = ori_token_idx / seq_len;
const int ori_bi = batch_id_per_token[token_idx];
if (seq_lens && seq_lens[ori_bi] == 0) continue;
const int bias = linear_index % offset;
const int qkv_id = bias / hidden_size;
@@ -61,7 +61,7 @@ __global__ void VariableLengthRotaryKernel(
const int hi = qkv_bias / last_dim;
const int h_bias = qkv_bias % last_dim;
const int ori_seq_id = ori_token_idx % seq_len + seq_lens_decoder[ori_bi];
const int ori_seq_id = (token_idx - cu_seqlens_q[ori_bi]) + seq_lens_decoder[ori_bi];
const int emb_idx = ori_seq_id * half_lastdim + h_bias / 2;
const int bias_idx = qkv_id * hidden_size + hi * last_dim + h_bias;
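
Note: the rotary kernels below all make the same two-part substitution: batch_id_per_token[token_idx] replaces the padded-index reconstruction (token_idx + padding_offsets[token_idx], then / seq_len), and token_idx - cu_seqlens_q[ori_bi] replaces the % seq_len position. A small sketch, with the tables built the way the kernels assume, checking that both mappings agree:

#include <cassert>
#include <vector>

int main() {
  const int seq_len = 6;            // max_seq_len of the padded layout
  std::vector<int> lens = {2, 3};   // actual tokens per batch
  std::vector<int> padding_offsets, batch_id_per_token;
  std::vector<int> cu_seqlens_q = {0};

  int pad = 0;
  for (int b = 0; b < (int)lens.size(); ++b) {
    for (int i = 0; i < lens[b]; ++i) {
      padding_offsets.push_back(pad);  // padding skipped before this token
      batch_id_per_token.push_back(b);
    }
    pad += seq_len - lens[b];
    cu_seqlens_q.push_back(cu_seqlens_q.back() + lens[b]);
  }

  for (int t = 0; t < cu_seqlens_q.back(); ++t) {
    // old mapping: recover the padded index, then divide / mod
    int ori_token_idx = t + padding_offsets[t];
    int old_bi = ori_token_idx / seq_len;
    int old_pos = ori_token_idx % seq_len;
    // new mapping: direct lookup + offset into this batch's segment
    int new_bi = batch_id_per_token[t];
    int new_pos = t - cu_seqlens_q[new_bi];
    assert(old_bi == new_bi && old_pos == new_pos);
  }
  return 0;
}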
@@ -107,7 +107,8 @@ __global__ void VariableLengthRotaryKernel(
const T *qkv,
const float *cos_emb, // [1, 1, seq_len, dim_head / 2]
const float *sin_emb,
const int *padding_offsets,
const int *batch_id_per_token,
const int *cu_seqlens_q,
const int *seq_lens,
const int *seq_lens_decoder,
T *qkv_out,
@@ -130,8 +131,7 @@ __global__ void VariableLengthRotaryKernel(
linear_index < elem_cnt;
linear_index += step) {
const int token_idx = linear_index / offset;
const int ori_token_idx = token_idx + padding_offsets[token_idx];
const int ori_bi = ori_token_idx / seq_len;
const int ori_bi = batch_id_per_token[token_idx];
if (seq_lens && seq_lens[ori_bi] == 0) continue;
const int bias = linear_index % offset;
const int qkv_id = bias / hidden_size;
@@ -139,7 +139,7 @@ __global__ void VariableLengthRotaryKernel(
const int hi = qkv_bias / last_dim;
const int h_bias = qkv_bias % last_dim;
const int ori_seq_id = ori_token_idx % seq_len + seq_lens_decoder[ori_bi];
const int ori_seq_id = (token_idx - cu_seqlens_q[ori_bi]) + seq_lens_decoder[ori_bi];
const int emb_idx = ori_seq_id * half_lastdim + h_bias / 2;
const int64_t base_idx = token_idx * 3 * hidden_size +
@@ -167,7 +167,8 @@ __global__ void NeoxVariableLengthRotaryKernel(
const int *qkv,
const float *cos_emb, // [1, 1, seq_len, dim_head / 2]
const float *sin_emb,
const int *padding_offsets,
const int *batch_id_per_token,
const int *cu_seqlens_q,
const int *seq_lens,
const int *seq_lens_decoder,
const float *qkv_out_scales, // [3, num_head, dim_head]
@@ -199,8 +200,7 @@ __global__ void NeoxVariableLengthRotaryKernel(
linear_index < elem_cnt;
linear_index += step) {
const int token_idx = linear_index / offset;
const int ori_token_idx = token_idx + padding_offsets[token_idx];
const int ori_bi = ori_token_idx / seq_len;
const int ori_bi = batch_id_per_token[token_idx];
if (seq_lens && seq_lens[ori_bi] == 0) continue;
const int bias = linear_index % offset;
const int qkv_id = bias / hidden_size;
@@ -208,7 +208,7 @@ __global__ void NeoxVariableLengthRotaryKernel(
const int hi = qkv_bias / half_lastdim;
const int h_bias = qkv_bias % half_lastdim;
const int ori_seq_id = ori_token_idx % seq_len + seq_lens_decoder[ori_bi];
const int ori_seq_id = (token_idx - cu_seqlens_q[ori_bi]) + seq_lens_decoder[ori_bi];
const int emb_idx = ori_seq_id * last_dim + h_bias;
const int bias_idx_left =
@@ -261,7 +261,8 @@ __global__ void NeoxVariableLengthRotaryKernel(
const T *qkv,
const float *cos_emb, // [1, 1, seq_len, dim_head / 2]
const float *sin_emb,
const int *padding_offsets,
const int *batch_id_per_token,
const int *cu_seqlens_q,
const int *seq_lens,
const int *seq_lens_decoder,
T *qkv_out,
@@ -285,8 +286,7 @@ __global__ void NeoxVariableLengthRotaryKernel(
linear_index < elem_cnt;
linear_index += step) {
const int token_idx = linear_index / offset;
const int ori_token_idx = token_idx + padding_offsets[token_idx];
const int ori_bi = ori_token_idx / seq_len;
const int ori_bi = batch_id_per_token[token_idx];
if (seq_lens && seq_lens[ori_bi] == 0) continue;
const int bias = linear_index % offset;
const int qkv_id = bias / hidden_size;
@@ -294,7 +294,7 @@ __global__ void NeoxVariableLengthRotaryKernel(
const int hi = qkv_bias / half_lastdim;
const int h_bias = qkv_bias % half_lastdim;
const int ori_seq_id = ori_token_idx % seq_len + seq_lens_decoder[ori_bi];
const int ori_seq_id = (token_idx - cu_seqlens_q[ori_bi]) + seq_lens_decoder[ori_bi];
const int emb_idx = ori_seq_id * last_dim + h_bias;
const int base_idx_left = token_idx * 3 * full_hidden_size +
@@ -327,7 +327,8 @@ __global__ void GQAVariableLengthRotaryKernel(
const int *qkv,
const float *cos_emb, // [1, 1, seq_len, dim_head / 2]
const float *sin_emb,
const int *padding_offsets,
const int *batch_id_per_token,
const int *cu_seqlens_q,
const int *seq_lens,
const int *seq_lens_decoder,
const float *qkv_out_scales, // [3, q_num_head, dim_head]
@@ -357,14 +358,13 @@ __global__ void GQAVariableLengthRotaryKernel(
linear_index < elem_cnt;
linear_index += step) {
const int token_idx = linear_index / offset;
const int ori_token_idx = token_idx + padding_offsets[token_idx];
const int ori_bi = ori_token_idx / seq_len;
const int ori_bi = batch_id_per_token[token_idx];
if (seq_lens[ori_bi] == 0) continue;
const int bias = linear_index % offset;
const int hi = bias / last_dim;
const int h_bias = bias % last_dim;
const int ori_seq_id = ori_token_idx % seq_len + seq_lens_decoder[ori_bi];
const int ori_seq_id = (token_idx - cu_seqlens_q[ori_bi]) + seq_lens_decoder[ori_bi];
const int64_t emb_idx = ori_seq_id * half_lastdim + h_bias / 2;
const int64_t bias_idx = hi * last_dim + h_bias;
@@ -410,7 +410,8 @@ __global__ void GQAVariableLengthRotaryKernel(
const T *qkv,
const float *cos_emb,
const float *sin_emb,
const int *padding_offsets,
const int *batch_id_per_token,
const int *cu_seqlens_q,
const int *seq_lens,
const int *seq_lens_decoder,
T *qkv_out,
@@ -434,14 +435,13 @@ __global__ void GQAVariableLengthRotaryKernel(
linear_index < elem_cnt;
linear_index += step) {
const int token_idx = linear_index / offset;
const int ori_token_idx = token_idx + padding_offsets[token_idx];
const int ori_bi = ori_token_idx / seq_len;
const int ori_bi = batch_id_per_token[token_idx];
if (seq_lens[ori_bi] == 0) continue;
const int bias = linear_index % offset;
const int hi = bias / last_dim;
const int h_bias = bias % last_dim;
const int ori_seq_id = ori_token_idx % seq_len + seq_lens_decoder[ori_bi];
const int ori_seq_id = (token_idx - cu_seqlens_q[ori_bi]) + seq_lens_decoder[ori_bi];
const int64_t emb_idx = ori_seq_id * half_lastdim + h_bias / 2;
const int64_t base_idx =
@@ -472,7 +472,8 @@ __global__ void GQAVariableLengthRotaryQuantKVKernel(const int *qkv,
const float *cos_emb, // [1, 1, seq_len, dim_head / 2]
const float *sin_emb,
const float *qkv_out_scales,
const int *padding_offsets,
const int *batch_id_per_token,
const int *cu_seqlens_q,
const int *seq_lens,
const int *seq_lens_decoder,
const T *qkv_biases,
@@ -504,15 +505,13 @@ __global__ void GQAVariableLengthRotaryQuantKVKernel(const int *qkv,
linear_index < elem_cnt;
linear_index += step) {
const int token_idx = linear_index / offset;
const int ori_token_idx = token_idx + padding_offsets[token_idx];
const int ori_bi = ori_token_idx / seq_len;
const int ori_bi = batch_id_per_token[token_idx];
if (seq_lens[ori_bi] == 0) continue;
const int bias = linear_index % offset;
const int hi = bias / last_dim;
const int h_bias = bias % last_dim;
int ori_seq_id;
ori_seq_id = ori_token_idx % seq_len + seq_lens_decoder[ori_bi];
int ori_seq_id = (token_idx - cu_seqlens_q[ori_bi]) + seq_lens_decoder[ori_bi];
const int64_t emb_idx = ori_seq_id * half_lastdim + h_bias / 2;
const int64_t bias_idx = hi * last_dim + h_bias;
@@ -561,7 +560,8 @@ template <typename T, int VecSize = 1>
__global__ void GQAVariableLengthRotaryQuantKVKernel(const T *qkv,
const float *cos_emb, // [1, 1, seq_len, dim_head / 2]
const float *sin_emb,
const int *padding_offsets,
const int *batch_id_per_token,
const int *cu_seqlens_q,
const int *seq_lens,
const int *seq_lens_decoder,
const T *qkv_biases,
@@ -590,15 +590,13 @@ __global__ void GQAVariableLengthRotaryQuantKVKernel(const T *qkv,
linear_index < elem_cnt;
linear_index += step) {
const int token_idx = linear_index / offset;
const int ori_token_idx = token_idx + padding_offsets[token_idx];
const int ori_bi = ori_token_idx / seq_len;
const int ori_bi = batch_id_per_token[token_idx];
if (seq_lens[ori_bi] == 0) continue;
const int bias = linear_index % offset;
const int hi = bias / last_dim;
const int h_bias = bias % last_dim;
int ori_seq_id;
ori_seq_id = ori_token_idx % seq_len + seq_lens_decoder[ori_bi];
int ori_seq_id = (token_idx - cu_seqlens_q[ori_bi]) + seq_lens_decoder[ori_bi];
const int64_t emb_idx = ori_seq_id * half_lastdim + h_bias / 2;
const int64_t bias_idx = hi * last_dim + h_bias;
@@ -645,7 +643,8 @@ __global__ void GQANeoxVariableLengthRotaryKernel(
const int *qkv,
const float *cos_emb, // [1, 1, seq_len, dim_head / 2]
const float *sin_emb,
const int *padding_offsets,
const int *batch_id_per_token,
const int *cu_seqlens_q,
const int *seq_lens,
const int *seq_lens_decoder,
const float *qkv_out_scales, // [3, q_num_head, dim_head]
@@ -676,14 +675,13 @@ __global__ void GQANeoxVariableLengthRotaryKernel(
linear_index < elem_cnt;
linear_index += step) {
const int token_idx = linear_index / offset;
const int ori_token_idx = token_idx + padding_offsets[token_idx];
const int ori_bi = ori_token_idx / seq_len;
const int ori_bi = batch_id_per_token[token_idx];
if (seq_lens && seq_lens[ori_bi] == 0) continue;
const int bias = linear_index % offset;
const int hi = bias / half_lastdim;
const int h_bias = bias % half_lastdim;
const int ori_seq_id = ori_token_idx % seq_len + seq_lens_decoder[ori_bi];
const int ori_seq_id = (token_idx - cu_seqlens_q[ori_bi]) + seq_lens_decoder[ori_bi];
const int emb_idx = ori_seq_id * last_dim + h_bias;
const int bias_idx_left = hi * last_dim + h_bias;
@@ -736,7 +734,8 @@ __global__ void GQANeoxVariableLengthRotaryKernel(
const T *qkv,
const float *cos_emb,
const float *sin_emb,
const int *padding_offsets,
const int *batch_id_per_token,
const int *cu_seqlens_q,
const int *seq_lens,
const int *seq_lens_decoder,
const float *qkv_out_scales,
@@ -761,14 +760,13 @@ __global__ void GQANeoxVariableLengthRotaryKernel(
linear_index < elem_cnt;
linear_index += step) {
const int token_idx = linear_index / offset;
const int ori_token_idx = token_idx + padding_offsets[token_idx];
const int ori_bi = ori_token_idx / seq_len;
const int ori_bi = batch_id_per_token[token_idx];
if (seq_lens && seq_lens[ori_bi] == 0) continue;
const int bias = linear_index % offset;
const int hi = bias / half_lastdim;
const int h_bias = bias % half_lastdim;
const int ori_seq_id = ori_token_idx % seq_len + seq_lens_decoder[ori_bi];
const int ori_seq_id = (token_idx - cu_seqlens_q[ori_bi]) + seq_lens_decoder[ori_bi];
const int emb_idx = ori_seq_id * last_dim + h_bias;
const int base_idx_left =
@@ -805,7 +803,8 @@ __global__ void cache_kernel(
T *__restrict__ value_cache, // [num_blocks, kv_num_heads, block_size,
// head_size]
const int *__restrict__ block_tables, // [bsz, max_blocks_per_seq]
const int *__restrict__ padding_offsets, // [num_tokens]
const int *__restrict__ batch_id_per_token, // [num_tokens]
const int *__restrict__ cu_seqlens_q, // [bsz]
const int *__restrict__ seq_lens, // [bsz]
const int *__restrict__ seq_lens_decoder, // [bsz]
const int max_seq_len,
@@ -831,11 +830,9 @@ __global__ void cache_kernel(
const uint32_t qkv_bias = bias % hidden_size;
const uint32_t hi = qkv_bias / head_size;
const uint32_t h_bias = qkv_bias % head_size;
const uint32_t ori_token_idx = token_idx + padding_offsets[token_idx];
const uint32_t ori_bi = ori_token_idx / max_seq_len;
const uint32_t ori_bi = batch_id_per_token[token_idx];
if (seq_lens[ori_bi] == 0) continue;
const uint32_t ori_seq_id =
ori_token_idx % max_seq_len + seq_lens_decoder[ori_bi];
const uint32_t ori_seq_id = (token_idx - cu_seqlens_q[ori_bi]) + seq_lens_decoder[ori_bi];
const int32_t *block_table_now = nullptr;
@@ -878,8 +875,8 @@ __global__ void append_write_cache_kv_c8_qkv(
const int *__restrict__ tile_ids,
const int *__restrict__ seq_lens_this_time,
const int *__restrict__ seq_lens_decoder,
const int *__restrict__ padding_offsets,
const int *__restrict__ cum_offsets,
const int *__restrict__ batch_id_per_token,
const int *__restrict__ cu_seqlens_q,
const int *__restrict__ block_tables,
const int max_seq_len,
const int max_blocks_per_seq,
@@ -909,15 +906,46 @@ __global__ void append_write_cache_kv_c8_qkv(
const uint32_t end_len = start_len + seq_len_this_time;
const uint32_t tile_start = start_len_pad + tile_id * num_rows_per_block;
int block_id = __ldg(&block_table_now[tile_start / BLOCK_SIZE]);
uint32_t chunk_start = tile_start + wid * num_frags_z * 16 + tid / 8;
const uint32_t start_token_idx =
batch_id * max_seq_len - cum_offsets[batch_id];
const uint32_t start_token_idx = cu_seqlens_q[batch_id];
const uint32_t kv_batch_stride = (num_heads + 2 * kv_num_heads) * HEAD_DIM;
const uint32_t kv_h_stride = HEAD_DIM;
__shared__ T k_smem_ori[num_rows_per_block * HEAD_DIM];
__shared__ T v_smem_ori[num_rows_per_block * HEAD_DIM];
if (tile_start >= start_len) {
constexpr int KV_VEC_SIZE = 16 / sizeof(uint8_t); // 16
using LoadPadKVT = AlignedVector<uint8_t, KV_VEC_SIZE>;
// int lane_id = wid * 32 + tid;
// pad zero for this kv_head_idx for this block
LoadPadKVT pad_cache_vec;
*(reinterpret_cast<uint4*>(pad_cache_vec.val)) = make_uint4(0, 0, 0, 0);
// reset k
constexpr int num_vecs_per_head_k = HEAD_DIM / KV_VEC_SIZE;
constexpr int num_token_each_time_k = 32 / num_vecs_per_head_k;
uint32_t tgt_idx =
(block_id * kv_num_heads + kv_head_idx) * BLOCK_SIZE * HEAD_DIM +
tid % num_vecs_per_head_k * KV_VEC_SIZE;
for (int block_i = tid / num_vecs_per_head_k;
block_i < BLOCK_SIZE;
block_i += num_token_each_time_k) {
Store<uint8_t, KV_VEC_SIZE>(pad_cache_vec,
&cache_k[tgt_idx + block_i * HEAD_DIM]);
}
// reset v
const int num_vecs_per_head_v = BLOCK_SIZE / KV_VEC_SIZE;
const int num_token_each_time_v = 32 / num_vecs_per_head_v;
tgt_idx =
(block_id * kv_num_heads + kv_head_idx) * HEAD_DIM * BLOCK_SIZE +
tid % num_vecs_per_head_v * KV_VEC_SIZE;
for (int block_i = tid / num_vecs_per_head_v; block_i < HEAD_DIM;
block_i += num_token_each_time_v) {
Store<uint8_t, KV_VEC_SIZE>(
pad_cache_vec, &cache_v[tgt_idx + block_i * BLOCK_SIZE]);
}
}
smem_t k_smem(k_smem_ori);
smem_t v_smem(v_smem_ori);
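
Note: the guard added above zero-fills the whole [BLOCK_SIZE, HEAD_DIM] uint8 K tile (and the transposed V tile) with a single warp before a partial tile is written, so stale bytes past the valid length cannot leak into later reads. A CPU sketch of the same striding, with illustrative constants, verifying that 32 lanes cover every byte:

#include <cstdint>
#include <cstring>
#include <vector>

constexpr int BLOCK_SIZE = 64, HEAD_DIM = 128, KV_VEC_SIZE = 16;

// One warp (32 lanes) clears a [BLOCK_SIZE, HEAD_DIM] uint8 K tile in
// 16-byte vectors, mirroring the strides in the kernel above.
void zero_fill_k(std::vector<uint8_t>& cache_k) {
  constexpr int num_vecs_per_head_k = HEAD_DIM / KV_VEC_SIZE;      // 8
  constexpr int num_token_each_time_k = 32 / num_vecs_per_head_k;  // 4
  for (int tid = 0; tid < 32; ++tid) {
    int col = tid % num_vecs_per_head_k * KV_VEC_SIZE;
    for (int row = tid / num_vecs_per_head_k; row < BLOCK_SIZE;
         row += num_token_each_time_k) {
      std::memset(&cache_k[row * HEAD_DIM + col], 0, KV_VEC_SIZE);
    }
  }
}

int main() {
  std::vector<uint8_t> k(BLOCK_SIZE * HEAD_DIM, 0xFF);
  zero_fill_k(k);
  for (uint8_t b : k)
    if (b != 0) return 1;  // every byte must have been cleared
  return 0;
}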
@@ -980,7 +1008,6 @@ __global__ void append_write_cache_kv_c8_qkv(
uint32_t chunk_start_k = tile_start + wid * num_frags_z * 16 + tid / 4;
uint32_t kv_frag[4];
int block_id = __ldg(&block_table_now[tile_start / BLOCK_SIZE]);
const uint32_t write_n_stride = kv_num_heads * BLOCK_SIZE * HEAD_DIM;
const uint32_t write_h_stride = BLOCK_SIZE * HEAD_DIM;
const uint32_t write_b_stride = HEAD_DIM;
@@ -1118,8 +1145,8 @@ __global__ void append_write_cache_kv_c4_qkv(
const int *__restrict__ tile_ids,
const int *__restrict__ seq_lens_this_time,
const int *__restrict__ seq_lens_decoder,
const int *__restrict__ padding_offsets,
const int *__restrict__ cum_offsets,
const int *__restrict__ batch_id_per_token,
const int *__restrict__ cu_seqlens_q,
const int *__restrict__ block_tables,
const int max_seq_len,
const int max_blocks_per_seq,
@@ -1148,10 +1175,46 @@ __global__ void append_write_cache_kv_c4_qkv(
const uint32_t tile_start = start_len_pad + tile_id * num_rows_per_block;
uint32_t chunk_start = tile_start + wid * num_frags_z * 16 + tid / 8;
const uint32_t start_token_idx =
batch_id * max_seq_len - cum_offsets[batch_id];
const uint32_t start_token_idx = cu_seqlens_q[batch_id];
const uint32_t kv_batch_stride = (num_heads + 2 * kv_num_heads) * HEAD_DIM;
const uint32_t kv_h_stride = HEAD_DIM;
int block_id = __ldg(&block_table_now[tile_start / BLOCK_SIZE]);
const uint32_t HEAD_DIM_HALF = HEAD_DIM / 2;
const uint32_t BLOCK_SIZE_HALF = BLOCK_SIZE / 2;
if (tile_start >= start_len) {
constexpr int KV_VEC_SIZE = 16 / sizeof(uint8_t); // 16
using LoadPadKVT = AlignedVector<uint8_t, KV_VEC_SIZE>;
// pad zero for this kv_head_idx for this block
LoadPadKVT pad_cache_vec;
*(reinterpret_cast<uint4*>(pad_cache_vec.val)) = make_uint4(0, 0, 0, 0);
// reset k
constexpr int num_vecs_per_head_k = HEAD_DIM_HALF / KV_VEC_SIZE; // 4
constexpr int num_token_each_time_k = 32 / num_vecs_per_head_k; // 8
uint32_t tgt_idx =
(block_id * kv_num_heads + kv_head_idx) * BLOCK_SIZE * HEAD_DIM_HALF +
tid % num_vecs_per_head_k * KV_VEC_SIZE;
for (int block_i = tid / num_vecs_per_head_k;
block_i < BLOCK_SIZE;
block_i += num_token_each_time_k) {
Store<uint8_t, KV_VEC_SIZE>(pad_cache_vec,
&cache_k[tgt_idx + block_i * HEAD_DIM_HALF]);
}
// reset v
const int num_vecs_per_head_v = BLOCK_SIZE_HALF / KV_VEC_SIZE; // 2
const int num_token_each_time_v = 32 / num_vecs_per_head_v; // 16
tgt_idx =
(block_id * kv_num_heads + kv_head_idx) * HEAD_DIM * BLOCK_SIZE_HALF +
tid % num_vecs_per_head_v * KV_VEC_SIZE;
for (int block_i = tid / num_vecs_per_head_v; block_i < HEAD_DIM;
block_i += num_token_each_time_v) {
Store<uint8_t, KV_VEC_SIZE>(
pad_cache_vec, &cache_v[tgt_idx + block_i * BLOCK_SIZE_HALF]);
}
}
__shared__ T k_smem_ori[num_rows_per_block * HEAD_DIM];
__shared__ T v_smem_ori[num_rows_per_block * HEAD_DIM];
__shared__ T k_scale_smem[HEAD_DIM];
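
Note: the c4 variant is the same zero-fill with two 4-bit values packed per byte, which is why every stride above uses HEAD_DIM_HALF or BLOCK_SIZE_HALF. A tiny sketch of that packing assumption:

#include <cstdint>
#include <cstdio>

int main() {
  const int BLOCK_SIZE = 64, HEAD_DIM = 128;
  uint8_t packed = (0x3 << 4) | 0xA;  // hi nibble = 3, lo nibble = 10
  int lo = packed & 0xF;
  int hi = packed >> 4;
  printf("lo=%d hi=%d bytes_per_tile=%d\n", lo, hi,
         BLOCK_SIZE * HEAD_DIM / 2);  // half the bytes of the int8 tile
  return 0;
}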
@@ -1262,7 +1325,6 @@ __global__ void append_write_cache_kv_c4_qkv(
uint32_t chunk_start_k = tile_start + wid * num_frags_z * 16 + tid / 4;
uint32_t kv_frag[4];
int block_id = __ldg(&block_table_now[tile_start / BLOCK_SIZE]);
const uint32_t write_n_stride = kv_num_heads * BLOCK_SIZE * HEAD_DIM / 2;
const uint32_t write_h_stride = BLOCK_SIZE * HEAD_DIM / 2;
const uint32_t write_b_stride = HEAD_DIM / 2;
@@ -1407,7 +1469,8 @@ void rotary_qk_variable(
const float *qkv_out_scales, // [3, num_head, dim_head]
const T *qkv_bias,
const float *rotary_emb, // [2, 1, 1, seq_len, dim_head / 2]
const int *padding_offsets,
const int *batch_id_per_token,
const int *cu_seqlens_q,
const int *seq_lens,
const int *seq_lens_decoder,
const int token_num,
@@ -1439,7 +1502,8 @@ void rotary_qk_variable(
reinterpret_cast<const int *>(qkv_input),
cos_emb,
sin_emb,
padding_offsets,
batch_id_per_token,
cu_seqlens_q,
seq_lens,
seq_lens_decoder,
qkv_out_scales,
@@ -1455,7 +1519,8 @@ void rotary_qk_variable(
reinterpret_cast<const T *>(qkv_input),
cos_emb,
sin_emb,
padding_offsets,
batch_id_per_token,
cu_seqlens_q,
seq_lens,
seq_lens_decoder,
qkv_out,
@@ -1473,7 +1538,8 @@ void rotary_qk_variable(
reinterpret_cast<const int *>(qkv_input),
cos_emb,
sin_emb,
padding_offsets,
batch_id_per_token,
cu_seqlens_q,
seq_lens,
seq_lens_decoder,
qkv_out_scales,
@@ -1489,7 +1555,8 @@ void rotary_qk_variable(
reinterpret_cast<const T *>(qkv_input),
cos_emb,
sin_emb,
padding_offsets,
batch_id_per_token,
cu_seqlens_q,
seq_lens,
seq_lens_decoder,
qkv_out,
@@ -1508,7 +1575,8 @@ void gqa_rotary_qk_variable(
const float *qkv_out_scales, // [3, num_head, dim_head]
const T *qkv_bias,
const float *rotary_emb, // [2, 1, 1, seq_len, dim_head / 2]
const int *padding_offsets,
const int *batch_id_per_token,
const int *cu_seqlens_q,
const int *seq_lens,
const int *seq_lens_decoder,
const int token_num,
@@ -1543,7 +1611,8 @@ void gqa_rotary_qk_variable(
reinterpret_cast<const int *>(qkv_input),
cos_emb,
sin_emb,
padding_offsets,
batch_id_per_token,
cu_seqlens_q,
seq_lens,
seq_lens_decoder,
qkv_out_scales,
@@ -1561,7 +1630,8 @@ void gqa_rotary_qk_variable(
reinterpret_cast<const T *>(qkv_input),
cos_emb,
sin_emb,
padding_offsets,
batch_id_per_token,
cu_seqlens_q,
seq_lens,
seq_lens_decoder,
qkv_out,
@@ -1581,7 +1651,8 @@ void gqa_rotary_qk_variable(
reinterpret_cast<const int *>(qkv_input),
cos_emb,
sin_emb,
padding_offsets,
batch_id_per_token,
cu_seqlens_q,
seq_lens,
seq_lens_decoder,
qkv_out_scales,
@@ -1598,7 +1669,8 @@ void gqa_rotary_qk_variable(
reinterpret_cast<const T *>(qkv_input),
cos_emb,
sin_emb,
padding_offsets,
batch_id_per_token,
cu_seqlens_q,
seq_lens,
seq_lens_decoder,
qkv_out_scales,
@@ -1622,7 +1694,8 @@ void gqa_rotary_qk_quant_variable(
const T *cache_k_scales,
const T *cache_v_scales,
const float *rotary_emb, // [2, 1, 1, seq_len, dim_head / 2]
const int *padding_offsets,
const int *batch_id_per_token,
const int *cu_seqlens_q,
const int *seq_lens,
const int *seq_lens_decoder,
const int token_num,
@@ -1654,7 +1727,8 @@ void gqa_rotary_qk_quant_variable(
cos_emb,
sin_emb,
qkv_out_scales,
padding_offsets,
batch_id_per_token,
cu_seqlens_q,
seq_lens,
seq_lens_decoder,
qkv_bias,
@@ -1673,7 +1747,8 @@ void gqa_rotary_qk_quant_variable(
reinterpret_cast<const T *>(qkv_input),
cos_emb,
sin_emb,
padding_offsets,
batch_id_per_token,
cu_seqlens_q,
seq_lens,
seq_lens_decoder,
qkv_bias,
@@ -1699,7 +1774,8 @@ void CascadeAppendWriteCacheKVQKV(
&qkv, // [token_num, 3, num_head, head_dim] ([token_num, num_head + 2 *
// kv_num_heads, head_dim] if GQA)
const paddle::Tensor &block_table,
const paddle::Tensor &padding_offsets,
const paddle::Tensor &batch_id_per_token,
const paddle::Tensor &cu_seqlens_q,
const paddle::Tensor &seq_lens_encoder,
const paddle::Tensor &seq_lens_decoder,
const int max_seq_len,
@@ -1725,7 +1801,8 @@ void CascadeAppendWriteCacheKVQKV(
reinterpret_cast<T *>(key_cache_out->data<T>()),
reinterpret_cast<T *>(value_cache_out->data<T>()),
block_table.data<int>(),
padding_offsets.data<int>(),
batch_id_per_token.data<int>(),
cu_seqlens_q.data<int>(),
seq_lens_encoder.data<int>(),
seq_lens_decoder.data<int>(),
max_seq_len,
@@ -1749,8 +1826,8 @@ void CascadeAppendWriteCacheKVC8QKV(
const paddle::Tensor &cache_v_scale, // [num_kv_heads, head_dim]
const paddle::Tensor &seq_lens_this_time,
const paddle::Tensor &seq_lens_decoder,
const paddle::Tensor &padding_offsets,
const paddle::Tensor &cum_offsets,
const paddle::Tensor &batch_id_per_token,
const paddle::Tensor &cu_seqlens_q,
const paddle::Tensor &block_table,
const paddle::Tensor &batch_ids,
const paddle::Tensor &tile_ids_per_batch,
@@ -1814,8 +1891,8 @@ void CascadeAppendWriteCacheKVC8QKV(
tile_ids_per_batch.data<int>(),
seq_lens_this_time.data<int>(),
seq_lens_decoder.data<int>(),
padding_offsets.data<int>(),
cum_offsets.data<int>(),
batch_id_per_token.data<int>(),
cu_seqlens_q.data<int>(),
block_table.data<int>(),
max_seq_len,
max_blocks_per_seq,
@@ -1837,8 +1914,8 @@ void CascadeAppendWriteCacheKVC4QKV(
const paddle::Tensor &cache_v_zp, // [num_kv_heads, head_dim]
const paddle::Tensor &seq_lens_this_time,
const paddle::Tensor &seq_lens_decoder,
const paddle::Tensor &padding_offsets,
const paddle::Tensor &cum_offsets,
const paddle::Tensor &batch_id_per_token,
const paddle::Tensor &cu_seqlens_q,
const paddle::Tensor &block_table,
const paddle::Tensor &batch_ids,
const paddle::Tensor &tile_ids_per_batch,
@@ -1884,8 +1961,8 @@ void CascadeAppendWriteCacheKVC4QKV(
tile_ids_per_batch.data<int>(),
seq_lens_this_time.data<int>(),
seq_lens_decoder.data<int>(),
padding_offsets.data<int>(),
cum_offsets.data<int>(),
batch_id_per_token.data<int>(),
cu_seqlens_q.data<int>(),
block_table.data<int>(),
max_seq_len,
max_blocks_per_seq,


@@ -25,8 +25,8 @@ void EncoderWriteCacheWithRopeKernel(
const paddle::Tensor& seq_lens_this_time,
const paddle::Tensor& seq_lens_encoder,
const paddle::Tensor& seq_lens_decoder,
const paddle::Tensor& padding_offsets,
const paddle::Tensor& cum_offsets,
const paddle::Tensor& batch_id_per_token,
const paddle::Tensor& cu_seqlens_q,
const paddle::Tensor& block_tables,
const paddle::Tensor& batch_ids,
const paddle::Tensor& tile_ids,
@@ -63,7 +63,8 @@ void EncoderWriteCacheWithRopeKernel(
qkv_out_scales ? qkv_out_scales.get().data<float>() : nullptr,
qkv_biases ? qkv_biases.get().data<T>() : nullptr,
rotary_embs.get().data<float>(),
padding_offsets.data<int>(),
batch_id_per_token.data<int>(),
cu_seqlens_q.data<int>(),
seq_lens_encoder.data<int>(),
seq_lens_decoder.data<int>(),
token_num,
@@ -82,7 +83,8 @@ void EncoderWriteCacheWithRopeKernel(
qkv_out_scales ? qkv_out_scales.get().data<float>() : nullptr,
qkv_biases ? qkv_biases.get().data<T>() : nullptr,
rotary_embs.get().data<float>(),
padding_offsets.data<int>(),
batch_id_per_token.data<int>(),
cu_seqlens_q.data<int>(),
seq_lens_encoder.data<int>(),
seq_lens_decoder.data<int>(),
token_num,
@@ -103,7 +105,8 @@ void EncoderWriteCacheWithRopeKernel(
cache_k_scale ? cache_k_scale.get().data<T>() : nullptr,
cache_v_scale ? cache_v_scale.get().data<T>() : nullptr,
rotary_embs.get().data<float>(),
padding_offsets.data<int>(),
batch_id_per_token.data<int>(),
cu_seqlens_q.data<int>(),
seq_lens_encoder.data<int>(),
seq_lens_decoder.data<int>(),
token_num,
@@ -123,7 +126,8 @@ void EncoderWriteCacheWithRopeKernel(
CascadeAppendWriteCacheKVQKV<T>(meta_data,
*qkv_out,
block_tables,
padding_offsets,
batch_id_per_token,
cu_seqlens_q,
seq_lens_encoder,
seq_lens_decoder,
max_seq_len,
@@ -142,8 +146,8 @@ void EncoderWriteCacheWithRopeKernel(
cache_v_scale.get(),
seq_lens_this_time,
seq_lens_decoder,
padding_offsets,
cum_offsets,
batch_id_per_token,
cu_seqlens_q,
block_tables,
batch_ids,
tile_ids,
@@ -169,8 +173,8 @@ void EncoderWriteCacheWithRopeKernel(
cache_v_zp.get(),
seq_lens_this_time,
seq_lens_decoder,
padding_offsets,
cum_offsets,
batch_id_per_token,
cu_seqlens_q,
block_tables,
batch_ids,
tile_ids,


@@ -194,12 +194,12 @@ get_max_len_kv_ernel(int *max_seq_lens_out, const int *seq_lens_this_time,
std::vector<paddle::Tensor> GetBlockShapeAndSplitKVBlock(
const paddle::Tensor &seq_lens_encoder,
const paddle::Tensor &seq_lens_decoder,
const paddle::Tensor &seq_lens_this_time, const paddle::Tensor &cum_offsets,
const paddle::Tensor &seq_lens_this_time,
const int encoder_block_shape_q, const int decoder_block_shape_q,
const int group_size, const int block_size,
const int decoder_step_token_num) {
auto stream = seq_lens_encoder.stream();
int bsz = cum_offsets.shape()[0];
int bsz = seq_lens_this_time.shape()[0];
auto max_len_tensor =
GetEmptyTensor({8}, paddle::DataType::INT32, seq_lens_encoder.place());
GetMaxLen(seq_lens_decoder, seq_lens_this_time, seq_lens_encoder,
@@ -335,8 +335,7 @@ std::vector<paddle::Tensor> GetBlockShapeAndSplitKVBlock(
std::vector<paddle::DataType> GetBlockShapeAndSplitKVBlockInferDtype(
const paddle::DataType &seq_lens_encoder_dtype,
const paddle::DataType &seq_lens_decoder_dtype,
const paddle::DataType &seq_lens_this_time_dtype,
const paddle::DataType &cum_offsets_dtype) {
const paddle::DataType &seq_lens_this_time_dtype) {
return {
paddle::DataType::INT32, paddle::DataType::INT32, paddle::DataType::INT32,
paddle::DataType::INT32, paddle::DataType::INT32, paddle::DataType::INT32,
@@ -347,8 +346,7 @@ std::vector<paddle::DataType> GetBlockShapeAndSplitKVBlockInferDtype(
std::vector<std::vector<int64_t>> GetBlockShapeAndSplitKVBlockInferShape(
const std::vector<int64_t> &seq_lens_encoder_shape,
const std::vector<int64_t> &seq_lens_decoder_shape,
const std::vector<int64_t> &seq_lens_this_time_shape,
const std::vector<int64_t> &cum_offsets_shape) {
const std::vector<int64_t> &seq_lens_this_time_shape) {
std::vector<int64_t> dynamic_shape = {-1};
return {dynamic_shape,
@@ -365,8 +363,7 @@ std::vector<std::vector<int64_t>> GetBlockShapeAndSplitKVBlockInferShape(
}
PD_BUILD_STATIC_OP(get_block_shape_and_split_kv_block)
.Inputs({"seq_lens_encoder", "seq_lens_decoder", "seq_lens_this_time",
"cum_offsets"})
.Inputs({"seq_lens_encoder", "seq_lens_decoder", "seq_lens_this_time"})
.Outputs({paddle::Optional("encoder_batch_ids"),
paddle::Optional("encoder_tile_ids_per_batch"),
paddle::Optional("encoder_num_blocks"),


@@ -16,7 +16,6 @@
#include "paddle/extension.h"
#include "paddle/phi/core/memory/memcpy.h"
#include "encoder_write_cache_with_rope_impl.cuh"
#include "paddle/phi/kernels/gpu/flash_attn_v3_kernel.h"
#include "paddle/phi/backends/context_pool.h"
#include "remote_cache_kv_ipc.h"
@@ -25,7 +24,8 @@ __global__ void GQAVariableLengthRotarySplitKernel(
const T *qkv,
const float *cos_emb,
const float *sin_emb,
const int *padding_offsets,
const int *batch_id_per_token,
const int *cu_seqlens_q,
const int *seq_lens,
const int *seq_lens_decoder,
const int *cu_seqlens_k,
@@ -52,14 +52,13 @@ __global__ void GQAVariableLengthRotarySplitKernel(
linear_index < elem_cnt;
linear_index += step) {
const int token_idx = linear_index / offset;
const int ori_token_idx = token_idx + padding_offsets[token_idx];
const int ori_bi = ori_token_idx / seq_len;
const int ori_bi = batch_id_per_token[token_idx];
if (seq_lens[ori_bi] == 0) continue;
const int bias = linear_index % offset;
const int hi = bias / last_dim;
const int h_bias = bias % last_dim;
const int ori_seq_id = ori_token_idx % seq_len + seq_lens_decoder[ori_bi];
const int ori_seq_id = (token_idx - cu_seqlens_q[ori_bi]) + seq_lens_decoder[ori_bi];
const int kv_write_idx = cu_seqlens_k[ori_bi] + ori_seq_id;
const int64_t emb_idx = ori_seq_id * half_lastdim + h_bias / 2;
@@ -108,9 +107,10 @@ void gqa_rotary_qk_split_variable(
T *v,
const T *qkv_input,
const float *rotary_emb, // [2, 1, 1, seq_len, dim_head / 2]
const int *padding_offsets,
const int *batch_id_per_token,
const int *seq_lens_encoder,
const int *seq_lens_decoder,
const int *cu_seqlens_q,
const int *cu_seqlens_k,
const int token_num,
const int num_heads,
@@ -133,7 +133,8 @@ void gqa_rotary_qk_split_variable(
qkv_input,
cos_emb,
sin_emb,
padding_offsets,
batch_id_per_token,
cu_seqlens_q,
seq_lens_encoder,
seq_lens_decoder,
cu_seqlens_k,
@@ -148,13 +149,188 @@ void gqa_rotary_qk_split_variable(
dim_head);
}
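
Note: the new append_cache_kv_* kernels that follow read tiles back out of the paged KV cache into contiguous k_out/v_out buffers. A host-side sketch of the address mapping they implement, under the layout assumptions stated in the kernel comments (all concrete values illustrative):

#include <cstdio>

int main() {
  const int BLOCK_SIZE = 64, HEAD_DIM = 128, kv_num_heads = 8;
  int block_tables[2][4] = {{5, 9, -1, -1}, {2, -1, -1, -1}};
  int cu_seqlens_k[3] = {0, 70, 100};  // 70 and 30 cached tokens
  int batch_id = 0, tile_id = 1, kv_head_idx = 3;

  int block_id = block_tables[batch_id][tile_id];
  long src = ((long)block_id * kv_num_heads + kv_head_idx) *
             BLOCK_SIZE * HEAD_DIM;  // offset into the paged cache
  long dst = ((long)cu_seqlens_k[batch_id] + tile_id * BLOCK_SIZE) *
                 kv_num_heads * HEAD_DIM +
             (long)kv_head_idx * HEAD_DIM;  // offset into contiguous k_out
  int valid_rows = cu_seqlens_k[batch_id + 1] - cu_seqlens_k[batch_id] -
                   tile_id * BLOCK_SIZE;  // plays the role of end_idx
  printf("src=%ld dst=%ld valid_rows=%d\n", src, dst, valid_rows);
  return 0;
}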
template <typename T,
typename CacheT,
uint32_t HEAD_DIM,
uint32_t BLOCK_SIZE,
uint32_t NUM_WARPS=4>
__global__ void append_cache_kv_c16(
const T *__restrict__ cache_k,
const T *__restrict__ cache_v,
T *__restrict__ k_out,
T *__restrict__ v_out,
const int *__restrict__ seq_lens_this_time,
const int *__restrict__ seq_lens_decoder,
const int *__restrict__ cu_seqlens_k,
const int *__restrict__ block_tables,
const int *batch_ids,
const int *tile_ids_per_batch,
const int max_blocks_per_seq,
const int kv_num_heads) {
// start_kv_idx: first kv index handled by the current block
// batch_id: the batch this block belongs to
// TODO: 1. scale preload 2. frag_dq_T reuse 3. pipelining 4. coalesced stores 5. CacheT support for int8/fp8
const uint32_t tile_idx = blockIdx.x, kv_head_idx = blockIdx.z;
const uint32_t tid = threadIdx.x, wid = threadIdx.y;
const uint32_t batch_id = batch_ids[tile_idx];
const uint32_t start_kv_idx = tile_ids_per_batch[tile_idx] * BLOCK_SIZE;
const uint32_t end_idx = seq_lens_decoder[batch_id] - start_kv_idx;
if (seq_lens_this_time[batch_id] <= 0) {
return;
}
const int *cur_block_table = block_tables + batch_id * max_blocks_per_seq;
uint32_t block_id = cur_block_table[start_kv_idx / BLOCK_SIZE];
// cache_kv idx
uint32_t kv_h_stride = BLOCK_SIZE * HEAD_DIM;
uint32_t block_stride = kv_num_heads * kv_h_stride;
const CacheT *cur_cache_k = cache_k + block_id * block_stride + kv_head_idx * kv_h_stride;
const CacheT *cur_cache_v = cache_v + block_id * block_stride + kv_head_idx * kv_h_stride;
// k_out v_out idx
uint32_t kv_t_stride = kv_num_heads * HEAD_DIM;
T *k_write_ptr = k_out + (cu_seqlens_k[batch_id] + start_kv_idx) * kv_t_stride;
T *v_write_ptr = v_out + (cu_seqlens_k[batch_id] + start_kv_idx) * kv_t_stride;
uint32_t kv_frag[4];
T *frag_dq_T = reinterpret_cast<T *>(kv_frag);
constexpr uint32_t num_vecs_per_head =
HEAD_DIM / num_elems_per_128b<CacheT>();
constexpr uint32_t inv_kv_stride = 8 / num_vecs_per_head;
extern __shared__ uint8_t smem[];
smem_t k_smem(smem);
uint32_t k_smem_offset_w = smem_t::get_permuted_offset<num_vecs_per_head, inv_kv_stride>(
wid * 4 + tid / 8, tid % 8); // 4 * 4 per warp
uint32_t k_smem_offset_r = smem_t::get_permuted_offset<num_vecs_per_head, inv_kv_stride>(
wid * 16 + 8 * (tid / 16) + tid % 8, (tid % 16) / 8);
uint32_t k_read_idx = (wid * 4 + tid / 8) * HEAD_DIM +
tid % 8 * num_elems_per_128b<CacheT>();
// load k_smem 64 rows 128 cols
for (int fz = 0; fz < 4; fz++) { // 4 rows per warp once, 16 rows all 4 warps once, need 4 iter
for (int fy = 0; fy < 2; fy++) { // 8 * 128b = 64 * bf16 once, need 2 iter
k_smem.load_128b_async<SharedMemFillMode::kNoFill>(
k_smem_offset_w, cur_cache_k + k_read_idx, end_idx > 0);
k_smem_offset_w =
k_smem.advance_offset_by_column<8, num_vecs_per_head>(k_smem_offset_w, fy);
k_read_idx += 8 * num_elems_per_128b<CacheT>();
}
k_smem_offset_w =
k_smem.advance_offset_by_row<4 * NUM_WARPS, num_vecs_per_head>(k_smem_offset_w) - 16;
k_read_idx += 4 * NUM_WARPS * HEAD_DIM - 16 * num_elems_per_128b<CacheT>();
}
commit_group();
wait_group<0>();
__syncthreads();
// deal k_smem 64 rows 128 cols
for (int fz = 0; fz < 1; fz++) { // 16 rows per warp once, 64 rows all 4 warps once, need 1 iter
uint32_t row_idx = wid * 16 + tid / 4;
for (int fy = 0; fy < 8; fy++) { // 2 * 128b = 16 * bf16 once, need 8 iter
uint32_t col_idx = fy * 16 + tid % 4 * 2;
k_smem.ldmatrix_m8n8x4(k_smem_offset_r, kv_frag);
// layout
/***
r0c0,r0c1, r0c8,r0c9
r8c0,r8c1, r8c8,r8c9
***/
T *k_tile_ptr0 = k_write_ptr + row_idx * kv_t_stride + kv_head_idx * HEAD_DIM + col_idx;
T *k_tile_ptr1 = k_tile_ptr0 + 8 * kv_t_stride;
if (row_idx < end_idx) {
k_tile_ptr0[0] = frag_dq_T[0];
k_tile_ptr0[1] = frag_dq_T[1];
k_tile_ptr0[8] = frag_dq_T[2];
k_tile_ptr0[9] = frag_dq_T[3];
}
if (row_idx + 8 < end_idx) {
k_tile_ptr1[0] = frag_dq_T[4];
k_tile_ptr1[1] = frag_dq_T[5];
k_tile_ptr1[8] = frag_dq_T[6];
k_tile_ptr1[9] = frag_dq_T[7];
}
k_smem_offset_r = k_smem.advance_offset_by_column<2, num_vecs_per_head>(
k_smem_offset_r, fy);
}
k_smem_offset_r =
k_smem.advance_offset_by_row<16 * NUM_WARPS, num_vecs_per_head>(k_smem_offset_r) - 16;
}
// ================v================
smem_t v_smem(smem + BLOCK_SIZE * HEAD_DIM * sizeof(CacheT));
uint32_t v_smem_offset_w = smem_t::get_permuted_offset<num_vecs_per_head, inv_kv_stride>(
wid * 4 + tid / 8, tid % 8); // 4 * 4 per warp
uint32_t v_smem_offset_r = smem_t::get_permuted_offset<num_vecs_per_head, inv_kv_stride>(
wid * 16 + 8 * (tid / 16) + tid % 8, (tid % 16) / 8);
uint32_t v_read_idx = (wid * 4 + tid / 8) * HEAD_DIM +
tid % 8 * num_elems_per_128b<CacheT>();
// load v_smem 64 rows 128 cols
for (int fz = 0; fz < 4; fz++) { // 4 rows per warp once, 16 rows all 4 warps once, need 4 iter
for (int fy = 0; fy < 2; fy++) { // 8 * 128b = 64 * bf16 once, need 2 iter
v_smem.load_128b_async<SharedMemFillMode::kNoFill>(
v_smem_offset_w, cur_cache_v + v_read_idx, end_idx > 0);
v_smem_offset_w =
v_smem.advance_offset_by_column<8, num_vecs_per_head>(v_smem_offset_w, fy);
v_read_idx += 8 * num_elems_per_128b<CacheT>();
}
v_smem_offset_w =
v_smem.advance_offset_by_row<4 * NUM_WARPS, num_vecs_per_head>(v_smem_offset_w) - 16;
v_read_idx += 4 * NUM_WARPS * HEAD_DIM - 16 * num_elems_per_128b<CacheT>();
}
commit_group();
wait_group<0>();
__syncthreads();
// deal v_smem 64 rows 128 cols
for (int fz = 0; fz < 1; fz++) { // 16 rows per warp once, 64 rows all 4 warps once, need 1 iter
uint32_t row_idx = wid * 16 + tid / 4;
for (int fy = 0; fy < 8; fy++) { // 2 * 128b = 16 * bf16 once, need 8 iter
uint32_t col_idx = fy * 16 + tid % 4 * 2;
v_smem.ldmatrix_m8n8x4(v_smem_offset_r, kv_frag);
// layout
/***
r0c0,r0c1, r0c8,r0c9
r8c0,r8c1, r8c8,r8c9
***/
T *v_tile_ptr0 = v_write_ptr + row_idx * kv_t_stride + kv_head_idx * HEAD_DIM + col_idx;
T *v_tile_ptr1 = v_tile_ptr0 + 8 * kv_t_stride;
if (row_idx < end_idx) {
v_tile_ptr0[0] = frag_dq_T[0];
v_tile_ptr0[1] = frag_dq_T[1];
v_tile_ptr0[8] = frag_dq_T[2];
v_tile_ptr0[9] = frag_dq_T[3];
}
if (row_idx + 8 < end_idx) {
v_tile_ptr1[0] = frag_dq_T[4];
v_tile_ptr1[1] = frag_dq_T[5];
v_tile_ptr1[8] = frag_dq_T[6];
v_tile_ptr1[9] = frag_dq_T[7];
}
v_smem_offset_r = v_smem.advance_offset_by_column<2, num_vecs_per_head>(
v_smem_offset_r, fy);
}
v_smem_offset_r =
v_smem.advance_offset_by_row<16 * NUM_WARPS, num_vecs_per_head>(v_smem_offset_r) - 16;
}
}
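
Note: the stores in the kernel above follow the ldmatrix_m8n8x4 fragment layout documented in its comments: thread tid of a warp holds eight values covering rows {tid/4, tid/4+8} and columns {c, c+1, c+8, c+9} with c = tid%4*2 of a 16x16 tile, which is exactly why each thread writes at offsets 0, 1, 8, 9 and again 8 rows down. A sketch enumerating that mapping:

#include <cstdio>

int main() {
  for (int tid = 0; tid < 32; ++tid) {
    int row = tid / 4, col = tid % 4 * 2;
    printf("tid=%2d rows={%2d,%2d} cols={%2d,%2d,%2d,%2d}\n",
           tid, row, row + 8, col, col + 1, col + 8, col + 9);
  }
  return 0;
}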
template <typename T,
typename CacheT,
uint32_t HEAD_DIM,
uint32_t BLOCK_SIZE,
uint32_t NUM_WARPS=4,
bool IS_FP8=false>
__global__ void append_dequant_cache_kv_c8(
__global__ void append_cache_kv_c8(
const CacheT *__restrict__ cache_k,
const CacheT *__restrict__ cache_v,
T *__restrict__ k_out,
@@ -169,16 +345,16 @@ __global__ void append_dequant_cache_kv_c8(
const int *tile_ids_per_batch,
const int max_blocks_per_seq,
const int kv_num_heads) {
// start_kv_idx: 每个block的起始kv_idx
// batch_id每个block属于的batch
// TODO: 1.scale预取 2.frag_dq_T复用 3.流水线编排 4.store访存合并 5.cacheT支持int8/fp8)
// start_kv_idx: first kv index handled by the current block
// batch_id: the batch this block belongs to
// TODO: 1. scale preload 2. frag_dq_T reuse 3. pipelining 4. coalesced stores 5. CacheT support for int8/fp8
const uint32_t tile_idx = blockIdx.x, kv_head_idx = blockIdx.z;
const uint32_t tid = threadIdx.x, wid = threadIdx.y;
const uint32_t batch_id = batch_ids[tile_idx];
const uint32_t start_kv_idx = tile_ids_per_batch[tile_idx] * BLOCK_SIZE;
const uint32_t end_idx = seq_lens_decoder[batch_id] - start_kv_idx;
if (seq_lens_this_time <= 0) {
if (seq_lens_this_time[batch_id] <= 0) {
return;
}
@@ -192,8 +368,8 @@ __global__ void append_dequant_cache_kv_c8(
// k_out v_out idx
uint32_t kv_t_stride = kv_num_heads * HEAD_DIM;
T *k_write_ptr = k_out + (cu_seqlens_k[batch_id] + start_kv_idx) * kv_t_stride; // 当前k block起始指针
T *v_write_ptr = v_out + (cu_seqlens_k[batch_id] + start_kv_idx) * kv_t_stride; // 当前v block起始指针
T *k_write_ptr = k_out + (cu_seqlens_k[batch_id] + start_kv_idx) * kv_t_stride;
T *v_write_ptr = v_out + (cu_seqlens_k[batch_id] + start_kv_idx) * kv_t_stride;
uint32_t k_frag[4], v_frag[4], frag_dq[4];
T *frag_dq_T = reinterpret_cast<T *>(frag_dq);
@@ -214,13 +390,13 @@ __global__ void append_dequant_cache_kv_c8(
uint32_t k_smem_offset_r = smem_t::get_permuted_offset<num_vecs_per_head_k, inv_k_stride>(
wid * 16 + 8 * (tid / 16) + tid % 8, (tid % 16) / 8);
uint32_t k_read_idx = (wid * 4 + tid / 8) * HEAD_DIM +
tid % 8 * num_elems_per_128b<CacheT>();
// load k_smem 行是64 列是128
for (int fz = 0; fz < 4; fz++) { // 每个warp1次4行,循环4次16行,4个warp64行
for (int fy = 0; fy < 1; fy++) { // 一次8个128b = 128uint8
// load k_smem 64 rows, 128 cols
for (int fz = 0; fz < 4; fz++) { // 4 rows per warp once, 16 rows all 4 warps once, need 4 iter
for (int fy = 0; fy < 1; fy++) { // 8 * 128b = 128 * uint8 once, need 1 iter
k_smem.load_128b_async<SharedMemFillMode::kNoFill>(
k_smem_offset_w, cur_cache_k + k_read_idx, end_idx > 0);
k_smem_offset_w =
@@ -235,13 +411,13 @@ __global__ void append_dequant_cache_kv_c8(
wait_group<0>();
__syncthreads();
// deal k_smem 行是64 列是128
for (int fz = 0; fz < 1; fz++) { // 每个warp1次16行,4个warp64行
// deal k_smem 64 rows, 128 cols
for (int fz = 0; fz < 1; fz++) { // 16 rows per warp once, 64 rows all 4 warps once, need 1 iter
uint32_t row_idx = wid * 16 + tid / 4;
for (int fy = 0; fy < 4; fy++) { // 1次2个128b(32个uint8),4次循环8个128b128个uint8
for (int fy = 0; fy < 4; fy++) { // 2 * 128b = 32 * uint8 once, need 4 iter
uint32_t col_idx = fy * 32 + tid % 4 * 2;
k_smem.ldmatrix_m8n8x4(k_smem_offset_r, k_frag);
// 反量化 存储
// layout
/***
r0c0,r0c1,r0c8,r0c9, r8c0,r8c1,r8c8,r8c9
r0c16,r0c17,r0c24,r0c25, r8c16,r8c17,r8c24,r8c25
@@ -251,8 +427,7 @@ __global__ void append_dequant_cache_kv_c8(
T *k_tile_ptr1 = k_tile_ptr0 + 8 * kv_t_stride;
if (row_idx < end_idx) {
convert_c8<T,IS_FP8>(frag_dq_T,k_frag[2 * i]); // 4uint8/fp8 -> 4T
convert_c8<T,IS_FP8>(frag_dq_T,k_frag[2 * i]); // 4 * uint8/fp8 -> 4 * T
k_tile_ptr0[0] = frag_dq_T[0] * cache_k_scale;
k_tile_ptr0[1] = frag_dq_T[1] * cache_k_scale;
k_tile_ptr0[8] = frag_dq_T[2] * cache_k_scale;
@@ -260,8 +435,7 @@ __global__ void append_dequant_cache_kv_c8(
}
if (row_idx + 8 < end_idx) {
convert_c8<T,IS_FP8>(frag_dq_T + 4,k_frag[2 * i + 1]); // 4uint8/fp8 -> 4T
convert_c8<T,IS_FP8>(frag_dq_T + 4,k_frag[2 * i + 1]); // 4 * uint8/fp8 -> 4 * T
k_tile_ptr1[0] = frag_dq_T[4] * cache_k_scale;
k_tile_ptr1[1] = frag_dq_T[5] * cache_k_scale;
k_tile_ptr1[8] = frag_dq_T[6] * cache_k_scale;
@@ -275,8 +449,8 @@ __global__ void append_dequant_cache_kv_c8(
k_smem_offset_r =
k_smem.advance_offset_by_row<16 * NUM_WARPS, num_vecs_per_head_k>(k_smem_offset_r) - 8;
}
// ================v================
// ================v================
smem_t v_smem(smem + BLOCK_SIZE * HEAD_DIM * sizeof(CacheT));
uint32_t v_smem_offset_w = smem_t::get_permuted_offset<num_vecs_per_blocksize, inv_v_stride>(
wid * 8 + tid / 4, tid % 4); // 4 * 8 per warp
@@ -286,9 +460,9 @@ __global__ void append_dequant_cache_kv_c8(
uint32_t v_read_idx = (wid * 8 + tid / 4) * BLOCK_SIZE +
tid % 4 * num_elems_per_128b<CacheT>();
// load v_smem 行是128 列是64
for (int fy = 0; fy < 4; fy++) { // 每个warp1次8行,循环4次32行,4个warp128行
for (int fz = 0; fz < 1; fz++) { // 一次4个128b = 64uint8
// load v_smem 128 rows 64 cols
for (int fy = 0; fy < 4; fy++) { // 8 rows per warp once, 32 rows all 4 warps once, need 4 iter
for (int fz = 0; fz < 1; fz++) { // 4 * 128b = 64 * uint8 once, need 1 iter
v_smem.load_128b_async<SharedMemFillMode::kNoFill>(
v_smem_offset_w, cur_cache_v + v_read_idx, end_idx > 0);
v_smem_offset_w =
@@ -304,42 +478,32 @@ __global__ void append_dequant_cache_kv_c8(
wait_group<0>();
__syncthreads();
// deal v_smem 行是128 列是64 row_idx是head_dim, col_idx是block_size
for (int fy = 0; fy < 2; fy++) { // 每个warp1次16行,循环2次32行4个warp128行
// deal v_smem 128 rows 64 cols
for (int fy = 0; fy < 2; fy++) { // 16 rows per warp once, 64 rows all 4 warps once, need 2 iter
uint32_t dim_idx = fy * NUM_WARPS * 16 + wid * 16 + tid / 4;
for (int fz = 0; fz < 2; fz++) { // 1次2个128b(32个uint8),2次循环4个128b64个uint8
for (int fz = 0; fz < 2; fz++) { // 2 * 128b = 32 * uint8 once, need 2 iter
uint32_t kv_idx = fz * 32 + tid % 4 * 2;
v_smem.ldmatrix_m8n8x4(v_smem_offset_r, v_frag);
// 反量化 存储
// layout
for (int i = 0; i < 4 / 2; i++) {
T *v_tile_ptr0 = v_write_ptr + kv_idx * kv_t_stride + kv_head_idx * HEAD_DIM + dim_idx;
T *v_tile_ptr1 = v_tile_ptr0 + 8;
convert_c8<T,IS_FP8>(frag_dq_T, v_frag[2 * i]); // 4 * uint8/fp8 -> 4 * T
convert_c8<T,IS_FP8>(frag_dq_T + 4, v_frag[2 * i + 1]); // 4 * uint8/fp8 -> 4 * T
if (kv_idx < end_idx) {
convert_c8<T,IS_FP8>(frag_dq_T, v_frag[2 * i]); // 4个uint8/fp8 -> 4个T
#ifdef C8_DEBUG
if (tid == 0 && wid == 0 && tile_idx == 0 && kv_head_idx == 0) {
printf("1.fy: %d, fz:%d, row_idx: %d, col_idx: %d, v_frag: %.f, %.f, %.f, %.f \n",
fy, fz, kv_idx, dim_idx, static_cast<float>(frag_dq_T[0]), static_cast<float>(frag_dq_T[1]),
static_cast<float>(frag_dq_T[2]), static_cast<float>(frag_dq_T[3]));
}
#endif
v_tile_ptr0[0] = frag_dq_T[0] * cache_v_scale;
v_tile_ptr0[kv_t_stride] = frag_dq_T[1] * cache_v_scale;
v_tile_ptr0[8 * kv_t_stride] = frag_dq_T[2] * cache_v_scale;
v_tile_ptr0[9 * kv_t_stride] = frag_dq_T[3] * cache_v_scale;
convert_c8<T,IS_FP8>(frag_dq_T + 4, v_frag[2 * i + 1]); // 4个uint8/fp8 -> 4个T
#ifdef C8_DEBUG
if (tid == 0 && wid == 0 && tile_idx == 0 && kv_head_idx == 0) {
printf("2.fy: %d, fz:%d, row_idx: %d, col_idx: %d, v_frag: %.f, %.f, %.f, %.f \n",
fy, fz, kv_idx, dim_idx + 8, static_cast<float>(frag_dq_T[4]), static_cast<float>(frag_dq_T[5]),
static_cast<float>(frag_dq_T[6]), static_cast<float>(frag_dq_T[7]));
}
#endif
v_tile_ptr1[0] = frag_dq_T[4] * cache_v_scale;
}
if (kv_idx + 1 < end_idx) {
v_tile_ptr0[kv_t_stride] = frag_dq_T[1] * cache_v_scale;
v_tile_ptr1[kv_t_stride] = frag_dq_T[5] * cache_v_scale;
}
if (kv_idx + 8 < end_idx) {
v_tile_ptr0[8 * kv_t_stride] = frag_dq_T[2] * cache_v_scale;
v_tile_ptr1[8 * kv_t_stride] = frag_dq_T[6] * cache_v_scale;
}
if (kv_idx + 9 < end_idx) {
v_tile_ptr0[9 * kv_t_stride] = frag_dq_T[3] * cache_v_scale;
v_tile_ptr1[9 * kv_t_stride] = frag_dq_T[7] * cache_v_scale;
}
kv_idx += 16;
@@ -352,12 +516,250 @@ __global__ void append_dequant_cache_kv_c8(
}
}
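
Note: the rewritten V store above replaces one blanket bounds check with four, because after ldmatrix_m8n8x4 each thread's eight dequantized values land at four distinct kv positions (kv_idx, +1, +8, +9) across two head dims, and each position must be guarded against end_idx on its own. A trivial enumeration of the guards:

#include <cstdio>

int main() {
  const int end_idx = 10;              // valid kv tokens left in this tile
  const int kv_idx = 8, dim_idx = 0;
  const int kv_off[4] = {0, 1, 8, 9};  // positions held by one thread
  for (int j = 0; j < 4; ++j) {
    bool ok = kv_idx + kv_off[j] < end_idx;
    printf("kv=%2d dims={%d,%d}: %s\n", kv_idx + kv_off[j], dim_idx,
           dim_idx + 8, ok ? "write" : "skip");
  }
  return 0;
}

With the single old check, positions 9, 16, and 17 of this example would all have been written even though only 9 is still in bounds.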
template <typename T,
typename CacheT,
uint32_t HEAD_DIM,
uint32_t BLOCK_SIZE,
uint32_t NUM_WARPS=4>
__global__ void append_cache_kv_c4(
const CacheT *__restrict__ cache_k,
const CacheT *__restrict__ cache_v,
T *__restrict__ k_out,
T *__restrict__ v_out,
const T *__restrict__ cache_k_dequant_scales,
const T *__restrict__ cache_v_dequant_scales,
const T *__restrict__ cache_k_zero_point,
const T *__restrict__ cache_v_zero_point,
const int *__restrict__ seq_lens_this_time,
const int *__restrict__ seq_lens_decoder,
const int *__restrict__ cu_seqlens_k,
const int *__restrict__ block_tables,
const int *batch_ids,
const int *tile_ids_per_batch,
const int max_blocks_per_seq,
const int kv_num_heads) {
// start_kv_idx: first kv index handled by the current block
// batch_id: the batch this block belongs to
// TODO: 1. scale preload 2. frag_dq_T reuse 3. pipelining 4. coalesced stores 5. CacheT support for int8/fp8
const uint32_t tile_idx = blockIdx.x, kv_head_idx = blockIdx.z;
const uint32_t tid = threadIdx.x, wid = threadIdx.y;
const uint32_t batch_id = batch_ids[tile_idx];
const uint32_t start_kv_idx = tile_ids_per_batch[tile_idx] * BLOCK_SIZE;
const uint32_t end_idx = seq_lens_decoder[batch_id] - start_kv_idx;
if (seq_lens_this_time[batch_id] <= 0) {
return;
}
const int *cur_block_table = block_tables + batch_id * max_blocks_per_seq;
int block_id = cur_block_table[start_kv_idx / BLOCK_SIZE];
if (block_id < 0) block_id = 0;  // signed, so the guard against unassigned table entries can fire
constexpr uint32_t HEAD_DIM_HALF = HEAD_DIM / 2;
constexpr uint32_t BLOCK_SIZE_HALF = BLOCK_SIZE / 2;
// cache_kv idx
uint32_t kv_h_stride = BLOCK_SIZE * HEAD_DIM_HALF;
uint32_t block_stride = kv_num_heads * kv_h_stride;
const CacheT *cur_cache_k = cache_k + block_id * block_stride + kv_head_idx * kv_h_stride;
const CacheT *cur_cache_v = cache_v + block_id * block_stride + kv_head_idx * kv_h_stride;
// k_out v_out idx
uint32_t kv_t_stride = kv_num_heads * HEAD_DIM;
T *k_write_ptr = k_out + (cu_seqlens_k[batch_id] + start_kv_idx) * kv_t_stride;
T *v_write_ptr = v_out + (cu_seqlens_k[batch_id] + start_kv_idx) * kv_t_stride;
extern __shared__ uint8_t smem[];
uint32_t k_frag[4], v_frag[4], frag_dq[8];
T *frag_dq_T = reinterpret_cast<T *>(frag_dq);
// load dequant scales and zero points
const T *cache_k_scale_now = cache_k_dequant_scales + kv_head_idx * HEAD_DIM;
const T *cache_k_zp_now = cache_k_zero_point + kv_head_idx * HEAD_DIM;
const T *cache_v_scale_now = cache_v_dequant_scales + kv_head_idx * HEAD_DIM;
const T *cache_v_zp_now = cache_v_zero_point + kv_head_idx * HEAD_DIM;
T *cache_k_scale_smem = reinterpret_cast<T *>(
smem + BLOCK_SIZE * HEAD_DIM * sizeof(CacheT));
T *cache_k_zero_point_smem = cache_k_scale_smem + HEAD_DIM;
T *cache_v_scale_smem = cache_k_zero_point_smem + HEAD_DIM;
T *cache_v_zero_point_smem = cache_v_scale_smem + HEAD_DIM;
#pragma unroll
for (uint32_t i = wid * 32 + tid; i < HEAD_DIM; i += 128) {
cache_k_scale_smem[i] = cache_k_scale_now[i];
cache_k_zero_point_smem[i] = cache_k_zp_now[i] - static_cast<T>(136.f);
cache_v_scale_smem[i] = cache_v_scale_now[i];
cache_v_zero_point_smem[i] = cache_v_zp_now[i] - static_cast<T>(136.f);
}
smem_t k_smem(smem);
constexpr uint32_t num_vecs_per_head_k =
HEAD_DIM_HALF / num_elems_per_128b<CacheT>(); // 2
constexpr uint32_t num_vecs_per_blocksize =
BLOCK_SIZE_HALF / num_elems_per_128b<CacheT>();
constexpr uint32_t inv_k_stride = 8 / num_vecs_per_head_k; // 4
constexpr uint32_t inv_v_stride = 8 / num_vecs_per_blocksize;
uint32_t k_smem_offset_w = smem_t::get_permuted_offset<num_vecs_per_head_k, inv_k_stride>(
wid * 8 + tid / 4, tid % 4); // 2(iter) * 4(warp) * 8 row per warp
uint32_t k_smem_offset_r = smem_t::get_permuted_offset<num_vecs_per_head_k, inv_k_stride>(
wid * 16 + 8 * (tid / 16) + tid % 8, (tid % 16) / 8); //
uint32_t k_read_idx = (wid * 8 + tid / 4) * HEAD_DIM / 2 +
tid % 4 * num_elems_per_128b<CacheT>();
// load k_smem 64 rows 128 cols
for (int fz = 0; fz < 2; fz++) { // 8 rows per warp once, 32 rows all 4 warps once, need 2 iter
for (int fy = 0; fy < 1; fy++) { // 4 * 128b = 128 * int4 once, need 1 iter
k_smem.load_128b_async<SharedMemFillMode::kNoFill>(
k_smem_offset_w, cur_cache_k + k_read_idx, end_idx > 0);
k_smem_offset_w =
k_smem.advance_offset_by_column<4, num_vecs_per_head_k>(k_smem_offset_w, fy);
k_read_idx += 4 * num_elems_per_128b<CacheT>();
}
k_smem_offset_w =
k_smem.advance_offset_by_row<8 * NUM_WARPS, num_vecs_per_head_k>(k_smem_offset_w) - 4;
k_read_idx += 8 * NUM_WARPS * HEAD_DIM / 2 - 4 * num_elems_per_128b<CacheT>();
}
commit_group();
wait_group<0>();
__syncthreads();
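The load loop above follows the cp.async pattern: issue asynchronous 128-bit copies into shared memory, close the batch with commit_group(), then block on wait_group<0>() and __syncthreads() before any thread reads the data. A minimal self-contained sketch of the same pattern with CUDA's built-in pipeline intrinsics (sm_80 and newer; the smem_t helpers in this file wrap an equivalent mechanism):

#include <cuda_pipeline.h>

__global__ void async_copy_then_read(const float4 *__restrict__ src,
                                     float4 *__restrict__ dst) {
  __shared__ float4 smem[128];
  const int i = threadIdx.x;  // launch with 128 threads
  __pipeline_memcpy_async(&smem[i], &src[i], sizeof(float4));  // issue copy
  __pipeline_commit();       // analogous to commit_group()
  __pipeline_wait_prior(0);  // analogous to wait_group<0>()
  __syncthreads();           // make smem visible to the whole block
  dst[i] = smem[i];
}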
// deal k_smem 64 rows 128 cols
for (int fz = 0; fz < 1; fz++) { // 16 rows per warp once, 64 rows all 4 warps once, need 1 iter
uint32_t row_idx = wid * 16 + tid / 4;
for (int fy = 0; fy < 2; fy++) { // 2 * 128b = 64 * int4 once, need 2 iter
uint32_t col_idx = fy * 64 + tid % 4 * 2;
k_smem.ldmatrix_m8n8x4(k_smem_offset_r, k_frag);
for (int i = 0; i < 2; i++) {
T *k_tile_ptr0 = k_write_ptr + row_idx * kv_t_stride + kv_head_idx * HEAD_DIM + col_idx;
T *k_tile_ptr1 = k_tile_ptr0 + 8 * kv_t_stride;
convert_int4(frag_dq_T, k_frag[2 * i]);
convert_int4(frag_dq_T + 8, k_frag[2 * i + 1]);
if (row_idx < end_idx) {
k_tile_ptr0[0] = frag_dq_T[0] * cache_k_scale_smem[col_idx] + cache_k_zero_point_smem[col_idx];
k_tile_ptr0[1] = frag_dq_T[1] * cache_k_scale_smem[col_idx + 1] + cache_k_zero_point_smem[col_idx + 1];
k_tile_ptr0[8] = frag_dq_T[2] * cache_k_scale_smem[col_idx + 8] + cache_k_zero_point_smem[col_idx + 8];
k_tile_ptr0[9] = frag_dq_T[3] * cache_k_scale_smem[col_idx + 9] + cache_k_zero_point_smem[col_idx + 9];
k_tile_ptr0[16] = frag_dq_T[8] * cache_k_scale_smem[col_idx + 16] + cache_k_zero_point_smem[col_idx + 16];
k_tile_ptr0[17] = frag_dq_T[9] * cache_k_scale_smem[col_idx + 17] + cache_k_zero_point_smem[col_idx + 17];
k_tile_ptr0[24] = frag_dq_T[10] * cache_k_scale_smem[col_idx + 24] + cache_k_zero_point_smem[col_idx + 24];
k_tile_ptr0[25] = frag_dq_T[11] * cache_k_scale_smem[col_idx + 25] + cache_k_zero_point_smem[col_idx + 25];
}
if (row_idx + 8 < end_idx) {
k_tile_ptr1[0] = frag_dq_T[4] * cache_k_scale_smem[col_idx] + cache_k_zero_point_smem[col_idx];
k_tile_ptr1[1] = frag_dq_T[5] * cache_k_scale_smem[col_idx + 1] + cache_k_zero_point_smem[col_idx + 1];
k_tile_ptr1[8] = frag_dq_T[6] * cache_k_scale_smem[col_idx + 8] + cache_k_zero_point_smem[col_idx + 8];
k_tile_ptr1[9] = frag_dq_T[7] * cache_k_scale_smem[col_idx + 9] + cache_k_zero_point_smem[col_idx + 9];
k_tile_ptr1[16] = frag_dq_T[12] * cache_k_scale_smem[col_idx + 16] + cache_k_zero_point_smem[col_idx + 16];
k_tile_ptr1[17] = frag_dq_T[13] * cache_k_scale_smem[col_idx + 17] + cache_k_zero_point_smem[col_idx + 17];
k_tile_ptr1[24] = frag_dq_T[14] * cache_k_scale_smem[col_idx + 24] + cache_k_zero_point_smem[col_idx + 24];
k_tile_ptr1[25] = frag_dq_T[15] * cache_k_scale_smem[col_idx + 25] + cache_k_zero_point_smem[col_idx + 25];
}
col_idx += 32;
}
k_smem_offset_r = k_smem.advance_offset_by_column<2, num_vecs_per_head_k>(
k_smem_offset_r, fy);
}
k_smem_offset_r =
k_smem.advance_offset_by_row<16 * NUM_WARPS, num_vecs_per_head_k>(k_smem_offset_r) - 4;
}
// ================v================
smem_t v_smem(smem + BLOCK_SIZE * HEAD_DIM * sizeof(CacheT) / 2);
uint32_t v_smem_offset_w = smem_t::get_permuted_offset<num_vecs_per_blocksize, inv_v_stride>(
wid * 16 + tid / 2, tid % 2); // 4 * 8 per warp
uint32_t v_smem_offset_r = smem_t::get_permuted_offset<num_vecs_per_blocksize, inv_v_stride>(
wid * 16 + 8 * (tid / 16) + tid % 8, (tid % 16) / 8);
uint32_t v_read_idx = (wid * 16 + tid / 2) * BLOCK_SIZE_HALF +
tid % 2 * num_elems_per_128b<CacheT>();
// load v_smem 128 rows 64 cols
for (int fy = 0; fy < 2; fy++) { // 16 rows per warp once, 64 rows all 4 warps once, need 2 iter
for (int fz = 0; fz < 1; fz++) { // 2 * 128b = 64 * int4 once, need 1 iter
v_smem.load_128b_async<SharedMemFillMode::kNoFill>(
v_smem_offset_w, cur_cache_v + v_read_idx, end_idx > 0);
v_smem_offset_w =
v_smem.advance_offset_by_column<2, num_vecs_per_blocksize>(v_smem_offset_w, fz);
v_read_idx += 2 * num_elems_per_128b<CacheT>();
}
v_smem_offset_w =
v_smem.advance_offset_by_row<16 * NUM_WARPS, num_vecs_per_blocksize>(v_smem_offset_w) - 2;
v_read_idx += 16 * NUM_WARPS * BLOCK_SIZE_HALF - 2 * num_elems_per_128b<CacheT>();
}
commit_group();
wait_group<0>();
__syncthreads();
// deal v_smem 128 rows 64 cols
for (int fy = 0; fy < 2; fy++) { // 16 rows per warp once, 64 rows all 4 warps once, need 2 iter
uint32_t dim_idx = fy * NUM_WARPS * 16 + wid * 16 + tid / 4;
for (int fz = 0; fz < 1; fz++) { // 2 * 128b = 64 * int4 once, need 1 iter
uint32_t kv_idx = fz * 64 + tid % 4 * 2;
v_smem.ldmatrix_m8n8x4(v_smem_offset_r, v_frag);
// layout
for (int i = 0; i < 2; i++) {
T *v_tile_ptr0 = v_write_ptr + kv_idx * kv_t_stride + kv_head_idx * HEAD_DIM + dim_idx;
T *v_tile_ptr1 = v_tile_ptr0 + 8;
convert_int4(frag_dq_T, v_frag[2 * i]);
convert_int4(frag_dq_T + 8, v_frag[2 * i + 1]);
if (kv_idx < end_idx) {
v_tile_ptr0[0] = frag_dq_T[0] * cache_v_scale_smem[dim_idx] + cache_v_zero_point_smem[dim_idx];
v_tile_ptr1[0] = frag_dq_T[4] * cache_v_scale_smem[dim_idx + 8] + cache_v_zero_point_smem[dim_idx + 8];
}
if (kv_idx + 1 < end_idx) {
v_tile_ptr0[kv_t_stride] = frag_dq_T[1] * cache_v_scale_smem[dim_idx] + cache_v_zero_point_smem[dim_idx];
v_tile_ptr1[kv_t_stride] = frag_dq_T[5] * cache_v_scale_smem[dim_idx + 8] + cache_v_zero_point_smem[dim_idx + 8];
}
if (kv_idx + 8 < end_idx) {
v_tile_ptr0[8 * kv_t_stride] = frag_dq_T[2] * cache_v_scale_smem[dim_idx] + cache_v_zero_point_smem[dim_idx];
v_tile_ptr1[8 * kv_t_stride] = frag_dq_T[6] * cache_v_scale_smem[dim_idx + 8] + cache_v_zero_point_smem[dim_idx + 8];
}
if (kv_idx + 9 < end_idx) {
v_tile_ptr0[9 * kv_t_stride] = frag_dq_T[3] * cache_v_scale_smem[dim_idx] + cache_v_zero_point_smem[dim_idx];
v_tile_ptr1[9 * kv_t_stride] = frag_dq_T[7] * cache_v_scale_smem[dim_idx + 8] + cache_v_zero_point_smem[dim_idx + 8];
}
if (kv_idx + 16 < end_idx) {
v_tile_ptr0[16 * kv_t_stride] = frag_dq_T[8] * cache_v_scale_smem[dim_idx] + cache_v_zero_point_smem[dim_idx];
v_tile_ptr1[16 * kv_t_stride] = frag_dq_T[12] * cache_v_scale_smem[dim_idx + 8] + cache_v_zero_point_smem[dim_idx + 8];
}
if (kv_idx + 17 < end_idx) {
v_tile_ptr0[17 * kv_t_stride] = frag_dq_T[9] * cache_v_scale_smem[dim_idx] + cache_v_zero_point_smem[dim_idx];
v_tile_ptr1[17 * kv_t_stride] = frag_dq_T[13] * cache_v_scale_smem[dim_idx + 8] + cache_v_zero_point_smem[dim_idx + 8];
}
if (kv_idx + 24 < end_idx) {
v_tile_ptr0[24 * kv_t_stride] = frag_dq_T[10] * cache_v_scale_smem[dim_idx] + cache_v_zero_point_smem[dim_idx];
v_tile_ptr1[24 * kv_t_stride] = frag_dq_T[14] * cache_v_scale_smem[dim_idx + 8] + cache_v_zero_point_smem[dim_idx + 8];
}
if (kv_idx + 25 < end_idx) {
v_tile_ptr0[25 * kv_t_stride] = frag_dq_T[11] * cache_v_scale_smem[dim_idx] + cache_v_zero_point_smem[dim_idx];
v_tile_ptr1[25 * kv_t_stride] = frag_dq_T[15] * cache_v_scale_smem[dim_idx + 8] + cache_v_zero_point_smem[dim_idx + 8];
}
kv_idx += 32;
}
v_smem_offset_r = v_smem.advance_offset_by_column<2, num_vecs_per_blocksize>(
v_smem_offset_r, fz);
}
v_smem_offset_r =
v_smem.advance_offset_by_row<16 * NUM_WARPS, num_vecs_per_blocksize>(v_smem_offset_r) - 2;
}
}
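The c4 kernel above dequantizes each int4 code with a per-channel affine transform, q * scale + zp, where the zero point staged to shared memory already has 136.f subtracted; that constant appears to fold the unsigned-nibble bias applied by convert_int4 into the zero point, though this diff does not show convert_int4 itself. A sketch of the per-byte step under that assumption (dequant_c4_pair is hypothetical):

#include <cstdint>

// Hypothetical c4 dequant for one packed byte: unpack two 4-bit codes and
// apply out = q * scale + zp, with zp pre-biased as in the kernel above.
template <typename T>
__host__ __device__ inline void dequant_c4_pair(uint8_t packed,
                                                T scale_lo, T zp_lo,
                                                T scale_hi, T zp_hi,
                                                T out[2]) {
  const int lo = packed & 0x0F;         // low nibble
  const int hi = (packed >> 4) & 0x0F;  // high nibble
  out[0] = static_cast<T>(lo) * scale_lo + zp_lo;
  out[1] = static_cast<T>(hi) * scale_hi + zp_hi;
}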
template <typename T, uint32_t HEAD_DIM, uint32_t BLOCK_SIZE>
void AppendDequantCache(
void AppendCacheKV(
const paddle::Tensor &cache_k,
const paddle::Tensor &cache_v,
const paddle::Tensor &cache_k_dequant_scales,
const paddle::Tensor &cache_v_dequant_scales,
const paddle::Tensor &cache_k_zp,
const paddle::Tensor &cache_v_zp,
const paddle::Tensor &seq_lens_this_time,
const paddle::Tensor &seq_lens_decoder,
const paddle::Tensor &cu_seqlens_k,
@@ -371,19 +773,41 @@ void AppendDequantCache(
paddle::Tensor *k_out,
paddle::Tensor *v_out,
const cudaStream_t& stream
) {
) {
using NV_TYPE = typename cascade_attn_type_traits<T>::type;
if (cache_quant_type == "cache_int8" || cache_quant_type == "cache_fp8") {
constexpr int NUM_WARPS = 4;
int block_num = cache_num_blocks_x.data<int>()[0];
dim3 grids(block_num, 1, kv_num_heads);
dim3 blocks(32, NUM_WARPS);
constexpr int NUM_WARPS = 4;
int block_num = cache_num_blocks_x.data<int>()[0];
dim3 grids(block_num, 1, kv_num_heads);
dim3 blocks(32, NUM_WARPS);
if (cache_quant_type == "none") {
const uint32_t smem_size = BLOCK_SIZE * HEAD_DIM * sizeof(T) * 2;
auto kernel_func = append_cache_kv_c16<NV_TYPE, NV_TYPE, HEAD_DIM, BLOCK_SIZE, NUM_WARPS>;
if (smem_size >= 48 * 1024) {
cudaFuncSetAttribute(kernel_func,
cudaFuncAttributeMaxDynamicSharedMemorySize,
smem_size);
}
kernel_func<<<grids, blocks, smem_size, stream>>>(
reinterpret_cast<NV_TYPE *>(const_cast<T *>(cache_k.data<T>())),
reinterpret_cast<NV_TYPE *>(const_cast<T *>(cache_v.data<T>())),
reinterpret_cast<NV_TYPE *>(k_out->data<T>()),
reinterpret_cast<NV_TYPE *>(v_out->data<T>()),
seq_lens_this_time.data<int>(),
seq_lens_decoder.data<int>(),
cu_seqlens_k.data<int>(),
block_tables.data<int>(),
cache_batch_ids.data<int>(),
cache_tile_ids_per_batch.data<int>(),
max_blocks_per_seq,
kv_num_heads
);
} else if (cache_quant_type == "cache_int8" || cache_quant_type == "cache_fp8") {
const uint32_t smem_size = BLOCK_SIZE * HEAD_DIM * sizeof(uint8_t) * 2;
auto kernel_func = append_dequant_cache_kv_c8<NV_TYPE, uint8_t, HEAD_DIM, BLOCK_SIZE, NUM_WARPS, false>;
auto kernel_func = append_cache_kv_c8<NV_TYPE, uint8_t, HEAD_DIM, BLOCK_SIZE, NUM_WARPS, false>;
if (cache_quant_type == "cache_fp8") {
kernel_func = append_dequant_cache_kv_c8<NV_TYPE, uint8_t, HEAD_DIM, BLOCK_SIZE, NUM_WARPS, true>;
kernel_func = append_cache_kv_c8<NV_TYPE, uint8_t, HEAD_DIM, BLOCK_SIZE, NUM_WARPS, true>;
}
if (smem_size >= 48 * 1024) {
cudaFuncSetAttribute(kernel_func,
@@ -406,6 +830,34 @@ void AppendDequantCache(
max_blocks_per_seq,
kv_num_heads
);
} else if (cache_quant_type == "cache_int4_zp") {
const uint32_t smem_size = BLOCK_SIZE * HEAD_DIM * sizeof(uint8_t) + 4 * HEAD_DIM * sizeof(T);
auto kernel_func = append_cache_kv_c4<NV_TYPE, uint8_t, HEAD_DIM, BLOCK_SIZE, NUM_WARPS>;
if (smem_size >= 48 * 1024) {
cudaFuncSetAttribute(kernel_func,
cudaFuncAttributeMaxDynamicSharedMemorySize,
smem_size);
}
kernel_func<<<grids, blocks, smem_size, stream>>>(
cache_k.data<uint8_t>(),
cache_v.data<uint8_t>(),
reinterpret_cast<NV_TYPE *>(k_out->data<T>()),
reinterpret_cast<NV_TYPE *>(v_out->data<T>()),
reinterpret_cast<NV_TYPE *>(const_cast<T *>(cache_k_dequant_scales.data<T>())),
reinterpret_cast<NV_TYPE *>(const_cast<T *>(cache_v_dequant_scales.data<T>())),
reinterpret_cast<NV_TYPE *>(const_cast<T *>(cache_k_zp.data<T>())),
reinterpret_cast<NV_TYPE *>(const_cast<T *>(cache_v_zp.data<T>())),
seq_lens_this_time.data<int>(),
seq_lens_decoder.data<int>(),
cu_seqlens_k.data<int>(),
block_tables.data<int>(),
cache_batch_ids.data<int>(),
cache_tile_ids_per_batch.data<int>(),
max_blocks_per_seq,
kv_num_heads
);
} else {
PADDLE_THROW("%s mode isn't implemented yet", cache_quant_type.c_str());
}
@@ -421,8 +873,7 @@ std::vector<paddle::Tensor> GQARopeWriteCacheKernel(
const paddle::Tensor& seq_lens_this_time,
const paddle::Tensor& seq_lens_encoder,
const paddle::Tensor& seq_lens_decoder,
const paddle::Tensor& padding_offsets,
const paddle::Tensor& cum_offsets,
const paddle::Tensor& batch_id_per_token,
const paddle::Tensor& block_tables,
const paddle::Tensor& kv_batch_ids,
const paddle::Tensor& kv_tile_ids,
@@ -450,9 +901,9 @@ std::vector<paddle::Tensor> GQARopeWriteCacheKernel(
const int token_num = qkv_dims[0];
const int max_blocks_per_seq = block_tables.dims()[1];
const int block_size = key_cache.dims()[2];
const int batch_size = cum_offsets.dims()[0];
const int batch_size = seq_lens_this_time.dims()[0];
const int kv_num_heads = key_cache_dims[1];
const int head_dim = key_cache_dims[3];
const int head_dim = cache_quant_type == "cache_int4_zp" ? key_cache_dims[3] * 2 : key_cache_dims[3];
const int num_heads = qkv_dims[qkv_dims.size() - 1] / head_dim - 2 * kv_num_heads;
const float softmax_scale = 1.f / sqrt(head_dim);
@@ -463,7 +914,7 @@ std::vector<paddle::Tensor> GQARopeWriteCacheKernel(
meta_data.q_num_heads = num_heads;
meta_data.max_blocks_per_seq = max_blocks_per_seq;
meta_data.block_size = block_size;
meta_data.batch_size = cum_offsets.dims()[0];
meta_data.batch_size = seq_lens_this_time.dims()[0];
phi::GPUContext* dev_ctx = static_cast<phi::GPUContext*>(phi::DeviceContextPool::Instance().Get(qkv.place()));
@@ -493,9 +944,10 @@ std::vector<paddle::Tensor> GQARopeWriteCacheKernel(
v.data<data_t>(),
qkv.data<data_t>(),
rotary_embs.data<float>(),
padding_offsets.data<int>(),
batch_id_per_token.data<int>(),
seq_lens_encoder.data<int>(),
seq_lens_decoder.data<int>(),
cu_seqlens_q.data<int>(),
cu_seqlens_k.data<int>(),
token_num,
num_heads,
@@ -510,7 +962,8 @@ std::vector<paddle::Tensor> GQARopeWriteCacheKernel(
meta_data,
qkv_out,
block_tables,
padding_offsets,
batch_id_per_token,
cu_seqlens_q,
seq_lens_encoder,
seq_lens_decoder,
max_seq_len,
@@ -527,8 +980,8 @@ std::vector<paddle::Tensor> GQARopeWriteCacheKernel(
cache_v_quant_scales.get(),
seq_lens_this_time,
seq_lens_decoder,
padding_offsets,
cum_offsets,
batch_id_per_token,
cu_seqlens_q,
block_tables,
kv_batch_ids,
kv_tile_ids,
@@ -539,6 +992,32 @@ std::vector<paddle::Tensor> GQARopeWriteCacheKernel(
stream,
const_cast<paddle::Tensor*>(&key_cache),
const_cast<paddle::Tensor*>(&value_cache));
} else if (cache_quant_type == "cache_int4_zp") {
CascadeAppendWriteCacheKVC4QKV<data_t, 128, 64>(
meta_data,
*const_cast<paddle::Tensor*>(&key_cache),
*const_cast<paddle::Tensor*>(&value_cache),
qkv_out,
cache_k_quant_scales.get(),
cache_v_quant_scales.get(),
cache_k_zp.get(),
cache_v_zp.get(),
seq_lens_this_time,
seq_lens_decoder,
batch_id_per_token,
cu_seqlens_q,
block_tables,
kv_batch_ids,
kv_tile_ids,
kv_num_blocks_data,
max_seq_len,
stream,
const_cast<paddle::Tensor*>(&key_cache),
const_cast<paddle::Tensor*>(&value_cache));
} else {
PD_THROW(
"cache_quant_type_str should be one of [none, cache_int8, cache_fp8, "
"cache_int4_zp]");
}
const char* fmt_write_cache_completed_signal_str = std::getenv("FLAGS_fmt_write_cache_completed_signal");
const char* FLAGS_use_pd_disaggregation_per_chunk = std::getenv("FLAGS_use_pd_disaggregation_per_chunk");
@@ -561,11 +1040,13 @@ std::vector<paddle::Tensor> GQARopeWriteCacheKernel(
}
if (token_num < kv_token_num) {
AppendDequantCache<data_t, 128, 64>(
AppendCacheKV<data_t, 128, 64>(
key_cache,
value_cache,
cache_k_dequant_scales.get(),
cache_v_dequant_scales.get(),
cache_k_zp.get(),
cache_v_zp.get(),
seq_lens_this_time,
seq_lens_decoder,
cu_seqlens_k,
@@ -594,8 +1075,7 @@ PD_BUILD_STATIC_OP(gqa_rope_write_cache)
"seq_lens_this_time",
"seq_lens_encoder",
"seq_lens_decoder",
"padding_offsets",
"cum_offsets",
"batch_id_per_token",
"block_tables",
"kv_batch_ids",
"kv_tile_ids_per_batch",

View File

@@ -13,6 +13,7 @@
// limitations under the License.
#pragma once
#include "helper.h"
#include "mla_cache_kernel.cuh"
template <paddle::DataType T>
@@ -22,8 +23,8 @@ std::vector<paddle::Tensor> PrefillMLAWriteCache(
const paddle::Tensor& kv_pe,
const paddle::Tensor& seq_lens,
const paddle::Tensor& seq_lens_decoder,
const paddle::Tensor& padding_offsets,
const paddle::Tensor& cum_offsets,
const paddle::Tensor& batch_id_per_token,
const paddle::Tensor& cu_seqlens_q,
const paddle::Tensor& block_tables,
const int max_seq_len,
cudaStream_t& stream,
@@ -53,8 +54,8 @@ std::vector<paddle::Tensor> PrefillMLAWriteCache(
reinterpret_cast<DataType_*>(const_cast<data_t*>(kv_pe.data<data_t>())),
reinterpret_cast<DataType_*>(kv_cache->data<data_t>()),
block_tables.data<int>(),
padding_offsets.data<int>(),
cum_offsets.data<int>(),
batch_id_per_token.data<int>(),
cu_seqlens_q.data<int>(),
seq_lens.data<int>(),
seq_lens_decoder.data<int>(),
max_seq_len,
@@ -73,8 +74,8 @@ std::vector<paddle::Tensor> PrefillMLAWriteCacheKernel(
const paddle::Tensor& kv_cache,
const paddle::Tensor& seq_lens,
const paddle::Tensor& seq_lens_decoder,
const paddle::Tensor& padding_offsets,
const paddle::Tensor& cum_offsets,
const paddle::Tensor& batch_id_per_token,
const paddle::Tensor& cu_seqlens_q,
const paddle::Tensor& block_tables,
const std::string& cache_quant_type_str,
const int max_seq_len) {
@@ -91,7 +92,7 @@ std::vector<paddle::Tensor> PrefillMLAWriteCacheKernel(
meta_data.max_blocks_per_seq = block_tables.dims()[1];
meta_data.block_size = kv_cache_dims[2];
meta_data.batch_size = cum_offsets.dims()[0];
meta_data.batch_size = seq_lens_decoder.dims()[0];
switch (kv_pe.dtype()) {
case paddle::DataType::BFLOAT16: {
return PrefillMLAWriteCache<paddle::DataType::BFLOAT16>(meta_data,
@@ -99,8 +100,8 @@ std::vector<paddle::Tensor> PrefillMLAWriteCacheKernel(
kv_pe,
seq_lens,
seq_lens_decoder,
padding_offsets,
cum_offsets,
batch_id_per_token,
cu_seqlens_q,
block_tables,
max_seq_len,
stream,
@@ -112,8 +113,8 @@ std::vector<paddle::Tensor> PrefillMLAWriteCacheKernel(
kv_pe,
seq_lens,
seq_lens_decoder,
padding_offsets,
cum_offsets,
batch_id_per_token,
cu_seqlens_q,
block_tables,
max_seq_len,
stream,
@@ -130,8 +131,8 @@ std::vector<paddle::Tensor> DecodeMLAWriteCache(
const paddle::Tensor& kv_pe,
const paddle::Tensor& seq_lens,
const paddle::Tensor& seq_lens_encoder,
const paddle::Tensor& padding_offsets,
const paddle::Tensor& cum_offsets,
const paddle::Tensor& batch_id_per_token,
const paddle::Tensor& cu_seqlens_q,
const paddle::Tensor& block_tables,
const int max_seq_len,
const bool speculate_decoder,
@@ -164,8 +165,8 @@ std::vector<paddle::Tensor> DecodeMLAWriteCache(
reinterpret_cast<DataType_*>(const_cast<data_t*>(kv_pe.data<data_t>())),
reinterpret_cast<DataType_*>(kv_cache->data<data_t>()),
block_tables.data<int>(),
padding_offsets.data<int>(),
cum_offsets.data<int>(),
batch_id_per_token.data<int>(),
cu_seqlens_q.data<int>(),
seq_lens.data<int>(),
seq_lens_encoder.data<int>(),
max_seq_len,
@@ -185,7 +186,7 @@ std::vector<paddle::Tensor> DecodeMLAWriteCache(
reinterpret_cast<DataType_*>(const_cast<data_t*>(kv_pe.data<data_t>())),
reinterpret_cast<DataType_*>(kv_cache->data<data_t>()),
block_tables.data<int>(),
cum_offsets.data<int>(),
cu_seqlens_q.data<int>(),
seq_lens.data<int>(),
seq_lens_encoder.data<int>(),
max_seq_len,
@@ -205,8 +206,8 @@ std::vector<paddle::Tensor> DecodeMLAWriteCacheKernel(
const paddle::Tensor& kv_cache,
const paddle::Tensor& seq_lens,
const paddle::Tensor& seq_lens_encoder,
const paddle::Tensor& padding_offsets,
const paddle::Tensor& cum_offsets,
const paddle::Tensor& batch_id_per_token,
const paddle::Tensor& cu_seqlens_q,
const paddle::Tensor& block_tables,
const std::string& cache_quant_type_str,
const int max_seq_len,
@@ -224,7 +225,7 @@ std::vector<paddle::Tensor> DecodeMLAWriteCacheKernel(
meta_data.max_blocks_per_seq = block_tables.dims()[1];
meta_data.block_size = kv_cache_dims[2];
meta_data.batch_size = cum_offsets.dims()[0];
meta_data.batch_size = seq_lens_encoder.dims()[0];
switch (kv_pe.dtype()) {
case paddle::DataType::BFLOAT16: {
return DecodeMLAWriteCache<paddle::DataType::BFLOAT16>(meta_data,
@@ -232,8 +233,8 @@ std::vector<paddle::Tensor> DecodeMLAWriteCacheKernel(
kv_pe,
seq_lens,
seq_lens_encoder,
padding_offsets,
cum_offsets,
batch_id_per_token,
cu_seqlens_q,
block_tables,
max_seq_len,
speculate_decoder,
@@ -246,8 +247,8 @@ std::vector<paddle::Tensor> DecodeMLAWriteCacheKernel(
kv_pe,
seq_lens,
seq_lens_encoder,
padding_offsets,
cum_offsets,
batch_id_per_token,
cu_seqlens_q,
block_tables,
max_seq_len,
speculate_decoder,
@@ -259,14 +260,14 @@ std::vector<paddle::Tensor> DecodeMLAWriteCacheKernel(
}
PD_BUILD_OP(prefill_mla_write_cache)
PD_BUILD_STATIC_OP(prefill_mla_write_cache)
.Inputs({"kv_nope",
"kv_pe",
"kv_cache",
"seq_lens",
"seq_lens_decoder",
"padding_offsets",
"cum_offsets",
"batch_id_per_token",
"cu_seqlens_q",
"block_tables"})
.Outputs({"kv_cache_out"})
.SetInplaceMap({{"kv_cache", "kv_cache_out"}})
@@ -274,14 +275,14 @@ PD_BUILD_OP(prefill_mla_write_cache)
"max_seq_len: int"})
.SetKernelFn(PD_KERNEL(PrefillMLAWriteCacheKernel));
PD_BUILD_OP(decode_mla_write_cache)
PD_BUILD_STATIC_OP(decode_mla_write_cache)
.Inputs({"kv_nope",
"kv_pe",
"kv_cache",
"seq_lens",
"seq_lens_encoder",
"padding_offsets",
"cum_offsets",
"batch_id_per_token",
"cu_seqlens_q",
"block_tables"})
.Outputs({"kv_cache_out"})
.SetInplaceMap({{"kv_cache", "kv_cache_out"}})

View File

@@ -24,7 +24,7 @@ __global__ void decode_absorb_cache_kernel(
T* __restrict__ kv_cache, // [num_blocks, kv_num_heads, block_size,
// nope_size]
const int* __restrict__ block_tables, // [bsz, max_blocks_per_seq]
const int* __restrict__ cum_offsets,
const int* __restrict__ cu_seqlens_q,
const int* __restrict__ seq_lens, // [bsz]
const int* __restrict__ seq_lens_encoder, // [bsz]
const int max_seq_len,
@@ -50,7 +50,7 @@ __global__ void decode_absorb_cache_kernel(
linear_index += step) {
const int ori_bi = linear_index / hidden_size;
const int bias = linear_index % hidden_size;
const int start_token_idx = ori_bi * max_seq_len - cum_offsets[ori_bi];
const int start_token_idx = cu_seqlens_q[ori_bi];
if (seq_lens_encoder[ori_bi] > 0) return;
const int write_seq_id = seq_lens[ori_bi];
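The replacement above relies on an identity between the two layouts: in the padded layout every batch owns max_seq_len slots and cum_offsets[b] counts the padding slots before batch b, so b * max_seq_len - cum_offsets[b] equals the number of real tokens before b, which is exactly cu_seqlens_q[b]. A small host-side check of that identity (values illustrative):

#include <cassert>
#include <vector>

int main() {
  const int max_seq_len = 8;
  const std::vector<int> seq_lens = {3, 5, 2};  // real tokens per batch
  int pad = 0, tok = 0;
  for (int b = 0; b < static_cast<int>(seq_lens.size()); ++b) {
    const int cum_offsets_b = pad;   // padding slots before batch b
    const int cu_seqlens_q_b = tok;  // real tokens before batch b
    assert(b * max_seq_len - cum_offsets_b == cu_seqlens_q_b);
    pad += max_seq_len - seq_lens[b];
    tok += seq_lens[b];
  }
  return 0;
}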
@@ -95,8 +95,8 @@ __global__ void speculate_decode_absorb_cache_kernel(
T* __restrict__ kv_cache, // [num_blocks, kv_num_heads, block_size,
// nope_size]
const int* __restrict__ block_tables, // [bsz, max_blocks_per_seq]
const int* __restrict__ padding_offsets,
const int* __restrict__ cum_offsets,
const int* __restrict__ batch_id_per_token,
const int* __restrict__ cu_seqlens_q,
const int* __restrict__ seq_lens, // [bsz]
const int* __restrict__ seq_lens_encoder, // [bsz]
const int max_seq_len,
@@ -121,10 +121,10 @@ __global__ void speculate_decode_absorb_cache_kernel(
linear_index < elem_cnt;
linear_index += step) {
const int token_id = linear_index / hidden_size;
const int ori_bi = (token_id + padding_offsets[token_id]) / max_seq_len;
const int ori_bi = batch_id_per_token[token_id];
if (seq_lens[ori_bi] == 0) continue;
const int bias = linear_index % hidden_size;
const int start_token_idx = ori_bi * max_seq_len - cum_offsets[ori_bi];
const int start_token_idx = cu_seqlens_q[ori_bi];
const int write_seq_id =
seq_lens[ori_bi] + token_id - start_token_idx;
if (write_seq_id == 0) continue;
@@ -143,7 +143,7 @@ __global__ void speculate_decode_absorb_cache_kernel(
ori_bi,
seq_lens[ori_bi],
token_id,
cum_offsets[ori_bi]);
cu_seqlens_q[ori_bi]);
}
if (bias < nope_hidden_size) { // pe
const uint32_t inner_bias = bias;
@@ -178,8 +178,8 @@ __global__ void prefill_absorb_cache_kernel(
T* __restrict__ kv_cache, // [num_blocks, kv_num_heads, block_size,
// nope_size]
const int* __restrict__ block_tables, // [bsz, max_blocks_per_seq]
const int* __restrict__ padding_offsets,
const int* __restrict__ cum_offsets,
const int* __restrict__ batch_id_per_token,
const int* __restrict__ cu_seqlens_q,
const int* __restrict__ seq_lens, // [bsz]
const int* __restrict__ seq_lens_decoder, // [bsz]
const int max_seq_len,
@@ -204,11 +204,9 @@ __global__ void prefill_absorb_cache_kernel(
linear_index += step) {
const uint32_t token_idx = linear_index / hidden_size;
const uint32_t bias = linear_index % hidden_size;
const uint32_t ori_token_idx = token_idx + padding_offsets[token_idx];
const uint32_t ori_bi = ori_token_idx / max_seq_len;
const uint32_t ori_bi = batch_id_per_token[token_idx];
if (seq_lens[ori_bi] == 0) continue;
const uint32_t ori_seq_id =
ori_token_idx % max_seq_len + seq_lens_decoder[ori_bi];
const uint32_t ori_seq_id = (token_idx - cu_seqlens_q[ori_bi]) + seq_lens_decoder[ori_bi];
const int* block_table_now = nullptr;
block_table_now = block_tables + ori_bi * max_blocks_per_seq;
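The same substitution recurs throughout this diff: the old code recovered a token's batch by re-padding and dividing, (token_id + padding_offsets[token_id]) / max_seq_len, while the new code reads a precomputed batch_id_per_token table and derives the in-sequence position from cu_seqlens_q. A sketch of how such a table could be built on the host (illustrative only; the actual producer of batch_id_per_token is outside this diff):

#include <vector>

// Hypothetical builder: one batch id per flattened (unpadded) token.
std::vector<int> build_batch_id_per_token(const std::vector<int> &seq_lens) {
  std::vector<int> out;
  for (int b = 0; b < static_cast<int>(seq_lens.size()); ++b) {
    out.insert(out.end(), seq_lens[b], b);  // seq_lens[b] copies of b
  }
  return out;
}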

View File

@@ -26,8 +26,8 @@ void DecodeMLAAttentionKernel(
const paddle::optional<paddle::Tensor>& smooth_weight,
const paddle::Tensor &seq_lens_q, // q_seq_len is 1
const paddle::Tensor &seq_lens_kv,
const paddle::Tensor &padding_offsets,
const paddle::Tensor &cum_offsets,
const paddle::Tensor &batch_id_per_token,
const paddle::Tensor &cu_seqlens_q,
const paddle::Tensor &block_table,
int max_seq_len,
int max_dec_len,

View File

@@ -26,8 +26,8 @@ __global__ void append_clear_cache_int8_block(
// block_size, head_size // 2]
const int* __restrict__ seq_lens,
const int* __restrict__ block_tables, // [bsz, max_blocks_per_seq]
const int* __restrict__ padding_offsets, // [num_tokens]
const int* __restrict__ cum_offsets,
const int* __restrict__ batch_id_per_token, // [num_tokens]
const int* __restrict__ cu_seqlens_q,
const int* __restrict__ seq_lens_encoder, // [bsz]
const int max_seq_len,
const int max_blocks_per_seq,
@@ -41,10 +41,10 @@ __global__ void append_clear_cache_int8_block(
const int wid = tid / 32;
const int lane_id = tid % 32;
const int token_id = blockIdx.x;
const int ori_token_id = token_id + padding_offsets[token_id];
const int bid = ori_token_id / max_seq_len;
const int start_token_idx = bid * max_seq_len - cum_offsets[bid];
const int bid = batch_id_per_token[token_id];
const int start_token_idx = cu_seqlens_q[bid];
const int head_idx = blockIdx.y * NUM_WARPS + wid;
if (seq_lens_encoder[bid] > 0) return;
@@ -100,8 +100,8 @@ __global__ void append_clear_cache_int4_block(
// block_size, head_size // 2]
const int* __restrict__ seq_lens,
const int* __restrict__ block_tables, // [bsz, max_blocks_per_seq]
const int* __restrict__ padding_offsets, // [num_tokens]
const int* __restrict__ cum_offsets,
const int* __restrict__ batch_id_per_token, // [num_tokens]
const int* __restrict__ cu_seqlens_q,
const int* __restrict__ seq_lens_encoder, // [bsz]
const int max_seq_len,
const int max_blocks_per_seq,
@@ -115,10 +115,10 @@ __global__ void append_clear_cache_int4_block(
const int wid = tid / 32;
const int lane_id = tid % 32;
const int token_id = blockIdx.x;
const int ori_token_id = token_id + padding_offsets[token_id];
const int bid = ori_token_id / max_seq_len;
const int start_token_idx = bid * max_seq_len - cum_offsets[bid];
const int bid = batch_id_per_token[token_id];
const int start_token_idx = cu_seqlens_q[bid];
const int head_idx = blockIdx.y * NUM_WARPS + wid;
if (seq_lens_encoder[bid] > 0) return;
@@ -178,8 +178,8 @@ __global__ void append_speculate_cache_rope_kernel(
// head_size // 2]
T* __restrict__ q_out,
const int* __restrict__ block_tables, // [bsz, max_blocks_per_seq]
const int* __restrict__ padding_offsets, // [num_tokens]
const int* __restrict__ cum_offsets,
const int* __restrict__ batch_id_per_token, // [num_tokens]
const int* __restrict__ cu_seqlens_q,
const int* __restrict__ seq_lens_decoder, // [bsz]
const float* __restrict__ cos_emb,
const float* __restrict__ sin_emb,
@@ -214,12 +214,12 @@ __global__ void append_speculate_cache_rope_kernel(
linear_index < elem_cnt;
linear_index += step) {
const int token_id = linear_index / hidden_size;
const int ori_bi = (token_id + padding_offsets[token_id]) / max_seq_len;
const int ori_bi = batch_id_per_token[token_id];
if (seq_lens_decoder[ori_bi] == 0) continue;
const int bias = linear_index % hidden_size;
const int hi = bias / head_size; // q + k + v
const int h_bias = bias % head_size;
const int start_token_idx = ori_bi * max_seq_len - cum_offsets[ori_bi];
const int start_token_idx = cu_seqlens_q[ori_bi];
const int write_seq_id =
seq_lens_decoder[ori_bi] + token_id - start_token_idx;
if (write_seq_id == 0) continue;
@@ -235,7 +235,7 @@ __global__ void append_speculate_cache_rope_kernel(
ori_bi,
seq_lens_decoder[ori_bi],
token_id,
cum_offsets[ori_bi]);
cu_seqlens_q[ori_bi]);
}
const int block_offset = write_seq_id % block_size;
@@ -311,8 +311,8 @@ __global__ void append_speculate_cache_neox_rope_kernel(
// head_size // 2]
T* __restrict__ qkv_out,
const int* __restrict__ block_tables, // [bsz, max_blocks_per_seq]
const int* __restrict__ padding_offsets, // [num_tokens]
const int* __restrict__ cum_offsets,
const int* __restrict__ batch_id_per_token, // [num_tokens]
const int* __restrict__ cu_seqlens_q,
const int* __restrict__ seq_lens_decoder, // [bsz]
const float* __restrict__ cos_emb,
const float* __restrict__ sin_emb,
@@ -347,12 +347,12 @@ __global__ void append_speculate_cache_neox_rope_kernel(
linear_index < elem_cnt;
linear_index += step) {
const int token_id = linear_index / half_hidden_size;
const int ori_bi = (token_id + padding_offsets[token_id]) / max_seq_len;
const int ori_bi = batch_id_per_token[token_id];
if (seq_lens_decoder[ori_bi] == 0) continue;
const int bias = linear_index % half_hidden_size;
const int hi = bias / half_head_size; // q + k + v
const int h_bias = bias % half_head_size;
const int start_token_idx = ori_bi * max_seq_len - cum_offsets[ori_bi];
const int start_token_idx = cu_seqlens_q[ori_bi];
const int write_seq_id =
seq_lens_decoder[ori_bi] + token_id - start_token_idx;
if (write_seq_id == 0) continue;
@@ -368,7 +368,7 @@ __global__ void append_speculate_cache_neox_rope_kernel(
ori_bi,
seq_lens_decoder[ori_bi],
token_id,
cum_offsets[ori_bi]);
cu_seqlens_q[ori_bi]);
}
const int block_offset = write_seq_id % block_size;
@@ -458,8 +458,8 @@ __global__ void append_speculate_cache_int8_rope_kernel(
// block_size, head_size // 2]
T* __restrict__ qkv_out,
const int* __restrict__ block_tables, // [bsz, max_blocks_per_seq]
const int* __restrict__ padding_offsets, // [num_tokens]
const int* __restrict__ cum_offsets,
const int* __restrict__ batch_id_per_token, // [num_tokens]
const int* __restrict__ cu_seqlens_q,
const int* __restrict__ seq_lens, // [bsz]
const int* __restrict__ seq_lens_encoder, // [bsz]
const float* __restrict__ cos_emb,
@@ -484,10 +484,10 @@ __global__ void append_speculate_cache_int8_rope_kernel(
const int wid = tid / 32;
const int lane_id = tid % 32;
const int token_id = blockIdx.x;
const int ori_token_id = token_id + padding_offsets[token_id];
const int bid = ori_token_id / max_seq_len;
const int start_token_idx = bid * max_seq_len - cum_offsets[bid];
const int bid = batch_id_per_token[token_id];
const int start_token_idx = cu_seqlens_q[bid];
const int head_idx = blockIdx.y * NUM_WARPS + wid;
int q_head_idx, k_head_idx, v_idx;
const int64_t hidden_size = (num_heads + 2 * gqa_group_size) * HeadDim;
@@ -690,8 +690,8 @@ __global__ void append_speculate_cache_int8_neox_rope_kernel(
// block_size, head_size // 2]
T* __restrict__ qkv_out,
const int* __restrict__ block_tables, // [bsz, max_blocks_per_seq]
const int* __restrict__ padding_offsets, // [num_tokens]
const int* __restrict__ cum_offsets,
const int* __restrict__ batch_id_per_token, // [num_tokens]
const int* __restrict__ cu_seqlens_q,
const int* __restrict__ seq_lens, // [bsz]
const int* __restrict__ seq_lens_encoder, // [bsz]
const float* __restrict__ cos_emb,
@@ -716,10 +716,10 @@ __global__ void append_speculate_cache_int8_neox_rope_kernel(
const int wid = tid / 32;
const int lane_id = tid % 32;
const int token_id = blockIdx.x;
const int ori_token_id = token_id + padding_offsets[token_id];
const int bid = ori_token_id / max_seq_len;
const int start_token_idx = bid * max_seq_len - cum_offsets[bid];
const int bid = batch_id_per_token[token_id];
const int start_token_idx = cu_seqlens_q[bid];
const int head_idx = blockIdx.y * NUM_WARPS + wid;
int q_head_idx, k_head_idx, v_idx;
@@ -1068,8 +1068,8 @@ __global__ void append_speculate_cache_int4_rope_kernel(
// block_size, head_size // 2]
T* __restrict__ qkv_out,
const int* __restrict__ block_tables, // [bsz, max_blocks_per_seq]
const int* __restrict__ padding_offsets, // [num_tokens]
const int* __restrict__ cum_offsets,
const int* __restrict__ batch_id_per_token, // [num_tokens]
const int* __restrict__ cu_seqlens_q,
const int* __restrict__ seq_lens, // [bsz]
const int* __restrict__ seq_lens_encoder, // [bsz]
const float* __restrict__ cos_emb,
@@ -1097,10 +1097,10 @@ __global__ void append_speculate_cache_int4_rope_kernel(
const int lane_id = tid % 32;
const int token_id = blockIdx.x;
const int ori_token_id = token_id + padding_offsets[token_id];
const int bid = ori_token_id / max_seq_len;
const int start_token_idx = bid * max_seq_len - cum_offsets[bid];
const int bid = batch_id_per_token[token_id];
const int start_token_idx = cu_seqlens_q[bid];
const int head_idx = blockIdx.y * NUM_WARPS + wid;
const int64_t hidden_size = (num_heads + 2 * gqa_group_size) * HeadDim;
@@ -1130,6 +1130,10 @@ __global__ void append_speculate_cache_int4_rope_kernel(
LoadOutScaleT out_scale_vec;
LoadEmbT cos_emb_vec;
LoadEmbT sin_emb_vec;
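// bias_vec is zero-initialized below because the qkv_biases load further
// down is commented out in this change, so it would otherwise be read
// uninitialized (assumed rationale).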
#pragma unroll
for (int v_i = 0; v_i < VecSize; v_i++) {
bias_vec[v_i] = 0;
}
const InT* qkv_now = quant_qkv + token_id * hidden_size;
T* qkv_out_now = qkv_out + token_id * hidden_size;
#pragma unroll
@@ -1137,8 +1141,8 @@ __global__ void append_speculate_cache_int4_rope_kernel(
head_bias += 32 * VecSize) {
const int bias_idx = head_idx * HeadDim + head_bias;
Load<InT, VecSize>(&qkv_now[bias_idx], &src_vec);
Load<T, VecSize>(&qkv_biases[bias_idx], &bias_vec);
Load<float, VecSize>(&qkv_out_scales[bias_idx], &out_scale_vec);
// Load<T, VecSize>(&qkv_biases[bias_idx], &bias_vec);
// Load<float, VecSize>(&qkv_out_scales[bias_idx], &out_scale_vec);
// q rope
const uint32_t emb_idx = write_seq_id * half_head_size + head_bias / 2;
Load<float, HalfVecSize>(&cos_emb[emb_idx], &cos_emb_vec);
@@ -1148,10 +1152,10 @@ __global__ void append_speculate_cache_int4_rope_kernel(
// dequant + add_bias + rope
float input_left = static_cast<float>(src_vec[2 * i]);
float input_right = static_cast<float>(src_vec[2 * i + 1]);
input_left = input_left * out_scale_vec[2 * i] +
static_cast<float>(bias_vec[2 * i]);
input_right = input_right * out_scale_vec[2 * i + 1] +
static_cast<float>(bias_vec[2 * i + 1]);
// input_left = input_left * out_scale_vec[2 * i] +
// static_cast<float>(bias_vec[2 * i]);
// input_right = input_right * out_scale_vec[2 * i + 1] +
// static_cast<float>(bias_vec[2 * i + 1]);
const float cos_tmp = cos_emb_vec[i];
const float sin_tmp = sin_emb_vec[i];
bias_vec[2 * i] =
@@ -1167,6 +1171,35 @@ __global__ void append_speculate_cache_int4_rope_kernel(
using LoadPadKVT = AlignedVector<uint8_t, KV_VEC_SIZE>;
const uint32_t kv_head_idx = (head_idx - num_heads) % gqa_group_size;
if (block_offset == 0) {
// pad zero for this kv_head_idx for this block
LoadPadKVT pad_cache_vec;
*(reinterpret_cast<uint4*>(pad_cache_vec.val)) = make_uint4(0, 0, 0, 0);
if (head_idx < num_heads + gqa_group_size) {
constexpr int num_vecs_per_head_dim = half_head_size / KV_VEC_SIZE;
constexpr int num_token_each_time = 32 / num_vecs_per_head_dim;
const uint32_t tgt_idx = (block_idx * gqa_group_size + kv_head_idx) *
block_size * half_head_size +
lane_id % num_vecs_per_head_dim * KV_VEC_SIZE;
for (int block_i = lane_id / num_vecs_per_head_dim;
block_i < block_size;
block_i += num_token_each_time) {
Store<uint8_t, KV_VEC_SIZE>(
pad_cache_vec, &key_cache[tgt_idx + block_i * half_head_size]);
}
} else {
const int num_vecs_per_head_dim = half_block_size / KV_VEC_SIZE;
const int num_token_each_time = 32 / num_vecs_per_head_dim;
const uint32_t tgt_idx = (block_idx * gqa_group_size + kv_head_idx) *
HeadDim * half_block_size +
lane_id % num_vecs_per_head_dim * KV_VEC_SIZE;
for (int block_i = lane_id / num_vecs_per_head_dim; block_i < HeadDim;
block_i += num_token_each_time) {
Store<uint8_t, KV_VEC_SIZE>(
pad_cache_vec, &value_cache[tgt_idx + block_i * half_block_size]);
}
}
}
constexpr int K_VEC_SIZE = 4;
constexpr int HALF_K_VEC_SIZE = 2;
using LoadKVResT = AlignedVector<uint8_t, K_VEC_SIZE>;
@@ -1182,7 +1215,11 @@ __global__ void append_speculate_cache_int4_rope_kernel(
LoadScaleT zp_vec1, zp_vec2;
LoadEmbT cos_emb_vec1, cos_emb_vec2;
LoadEmbT sin_emb_vec1, sin_emb_vec2;
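// zero-init (assumed rationale): the qkv_biases loads below are commented
// out in this change, so bias_vec1/bias_vec2 would otherwise be read
// uninitialized.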
#pragma unroll
for (int v_i = 0; v_i < HALF_K_VEC_SIZE; v_i++) {
bias_vec1[v_i] = 0;
bias_vec2[v_i] = 0;
}
const InT* qkv_now = quant_qkv + token_id * hidden_size;
const int head_bias = lane_id / 4 * 16 + lane_id % 4 * 2;
//////////
@@ -1191,11 +1228,11 @@ __global__ void append_speculate_cache_int4_rope_kernel(
Load<InT, HALF_K_VEC_SIZE>(&qkv_now[bias_idx], &src_vec1);
Load<InT, HALF_K_VEC_SIZE>(&qkv_now[bias_idx + 8], &src_vec2);
/////
Load<T, HALF_K_VEC_SIZE>(&qkv_biases[bias_idx], &bias_vec1);
Load<T, HALF_K_VEC_SIZE>(&qkv_biases[bias_idx + 8], &bias_vec2);
Load<float, HALF_K_VEC_SIZE>(&qkv_out_scales[bias_idx], &out_scale_vec1);
Load<float, HALF_K_VEC_SIZE>(&qkv_out_scales[bias_idx + 8],
&out_scale_vec2);
// Load<T, HALF_K_VEC_SIZE>(&qkv_biases[bias_idx], &bias_vec1);
// Load<T, HALF_K_VEC_SIZE>(&qkv_biases[bias_idx + 8], &bias_vec2);
// Load<float, HALF_K_VEC_SIZE>(&qkv_out_scales[bias_idx], &out_scale_vec1);
// Load<float, HALF_K_VEC_SIZE>(&qkv_out_scales[bias_idx + 8],
// &out_scale_vec2);
if (head_idx < num_heads + gqa_group_size) {
const uint32_t emb_idx = write_seq_id * half_head_size + head_bias / 2;
Load<float, 1>(&cos_emb[emb_idx], &cos_emb_vec1);
@@ -1215,10 +1252,10 @@ __global__ void append_speculate_cache_int4_rope_kernel(
float input_left = static_cast<float>(src_vec1[0]);
float input_right = static_cast<float>(src_vec1[1]);
input_left =
input_left * out_scale_vec1[0] + static_cast<float>(bias_vec1[0]);
input_right =
input_right * out_scale_vec1[1] + static_cast<float>(bias_vec1[1]);
// input_left =
// input_left * out_scale_vec1[0] + static_cast<float>(bias_vec1[0]);
// input_right =
// input_right * out_scale_vec1[1] + static_cast<float>(bias_vec1[1]);
if (head_idx < num_heads + gqa_group_size) {
float cos_tmp = cos_emb_vec1[0];
float sin_tmp = sin_emb_vec1[0];
@@ -1233,10 +1270,10 @@ __global__ void append_speculate_cache_int4_rope_kernel(
input_left = static_cast<float>(src_vec2[0]);
input_right = static_cast<float>(src_vec2[1]);
input_left =
input_left * out_scale_vec2[0] + static_cast<float>(bias_vec2[0]);
input_right =
input_right * out_scale_vec2[1] + static_cast<float>(bias_vec2[1]);
// input_left =
// input_left * out_scale_vec2[0] + static_cast<float>(bias_vec2[0]);
// input_right =
// input_right * out_scale_vec2[1] + static_cast<float>(bias_vec2[1]);
if (head_idx < num_heads + gqa_group_size) {
float cos_tmp = cos_emb_vec2[0];
float sin_tmp = sin_emb_vec2[0];
@@ -1374,8 +1411,8 @@ __global__ void append_speculate_cache_int4_neox_rope_kernel(
// block_size, head_size // 2]
T* __restrict__ qkv_out,
const int* __restrict__ block_tables, // [bsz, max_blocks_per_seq]
const int* __restrict__ padding_offsets, // [num_tokens]
const int* __restrict__ cum_offsets,
const int* __restrict__ batch_id_per_token, // [num_tokens]
const int* __restrict__ cu_seqlens_q,
const int* __restrict__ seq_lens, // [bsz]
const int* __restrict__ seq_lens_encoder, // [bsz]
const float* __restrict__ cos_emb,
@@ -1403,10 +1440,10 @@ __global__ void append_speculate_cache_int4_neox_rope_kernel(
const int lane_id = tid % 32;
const int token_id = blockIdx.x;
const int ori_token_id = token_id + padding_offsets[token_id];
const int bid = ori_token_id / max_seq_len;
const int start_token_idx = bid * max_seq_len - cum_offsets[bid];
const int bid = batch_id_per_token[token_id];
const int start_token_idx = cu_seqlens_q[bid];
const int head_idx = blockIdx.y * NUM_WARPS + wid;
const int64_t hidden_size = (num_heads + 2 * gqa_group_size) * HeadDim;
@@ -1792,4 +1829,4 @@ __global__ void append_speculate_cache_int4_neox_rope_kernel(
(uint_quant_value2 << 4) | (uint_quant_value1 & 0x0F);
}
}
}
}

View File

@@ -22,8 +22,8 @@ void append_speculate_cache_rope(const QKV_TYPE* qkv,
T* value_cache,
T* qkv_out,
const int* block_tables,
const int* padding_offsets,
const int* cum_offsets,
const int* batch_id_per_token,
const int* cu_seqlens_q,
const int* seq_lens,
const int* seq_lens_encoder,
const float* cos_emb,
@@ -59,8 +59,8 @@ void append_speculate_cache_rope(const QKV_TYPE* qkv,
value_cache,
qkv_out,
block_tables,
padding_offsets,
cum_offsets,
batch_id_per_token,
cu_seqlens_q,
seq_lens,
cos_emb,
sin_emb,
@@ -82,8 +82,8 @@ void append_speculate_cache_rope(const QKV_TYPE* qkv,
value_cache,
qkv_out,
block_tables,
padding_offsets,
cum_offsets,
batch_id_per_token,
cu_seqlens_q,
seq_lens,
cos_emb,
sin_emb,
@@ -106,8 +106,8 @@ void append_speculate_cache_int8_rope(const QKV_TYPE* qkv,
uint8_t* value_cache,
T* qkv_out,
const int* block_tables,
const int* padding_offsets,
const int* cum_offsets,
const int* batch_id_per_token,
const int* cu_seqlens_q,
const int* seq_lens,
const int* seq_lens_encoder,
const float* cos_emb,
@@ -136,8 +136,8 @@ void append_speculate_cache_int8_rope(const QKV_TYPE* qkv,
value_cache,
seq_lens,
block_tables,
padding_offsets,
cum_offsets,
batch_id_per_token,
cu_seqlens_q,
seq_lens_encoder,
max_seq_len,
max_blocks_per_seq,
@@ -151,8 +151,8 @@ void append_speculate_cache_int8_rope(const QKV_TYPE* qkv,
value_cache,
qkv_out,
block_tables,
padding_offsets,
cum_offsets,
batch_id_per_token,
cu_seqlens_q,
seq_lens,
seq_lens_encoder,
cos_emb,
@@ -175,8 +175,8 @@ void append_speculate_cache_int8_rope(const QKV_TYPE* qkv,
value_cache,
qkv_out,
block_tables,
padding_offsets,
cum_offsets,
batch_id_per_token,
cu_seqlens_q,
seq_lens,
seq_lens_encoder,
cos_emb,
@@ -201,8 +201,8 @@ void append_speculate_cache_int4_rope(const QKV_TYPE* qkv,
uint8_t* value_cache,
T* qkv_out,
const int* block_tables,
const int* padding_offsets,
const int* cum_offsets,
const int* batch_id_per_token,
const int* cu_seqlens_q,
const int* seq_lens,
const int* seq_lens_encoder,
const float* cos_emb,
@@ -233,8 +233,8 @@ void append_speculate_cache_int4_rope(const QKV_TYPE* qkv,
value_cache,
seq_lens,
block_tables,
padding_offsets,
cum_offsets,
batch_id_per_token,
cu_seqlens_q,
seq_lens_encoder,
max_seq_len,
max_blocks_per_seq,
@@ -248,8 +248,8 @@ void append_speculate_cache_int4_rope(const QKV_TYPE* qkv,
value_cache,
qkv_out,
block_tables,
padding_offsets,
cum_offsets,
batch_id_per_token,
cu_seqlens_q,
seq_lens,
seq_lens_encoder,
cos_emb,
@@ -274,8 +274,8 @@ void append_speculate_cache_int4_rope(const QKV_TYPE* qkv,
value_cache,
qkv_out,
block_tables,
padding_offsets,
cum_offsets,
batch_id_per_token,
cu_seqlens_q,
seq_lens,
seq_lens_encoder,
cos_emb,
@@ -301,8 +301,8 @@ void SpeculateWriteCacheWithRoPEKernel(
const paddle::Tensor& qkv,
const paddle::Tensor& seq_lens,
const paddle::Tensor& seq_lens_encoder,
const paddle::Tensor& padding_offsets,
const paddle::Tensor& cum_offsets,
const paddle::Tensor& batch_id_per_token,
const paddle::Tensor& cu_seqlens_q,
const paddle::Tensor& block_tables,
const paddle::optional<paddle::Tensor>& rotary_embs,
const paddle::optional<paddle::Tensor>& qkv_out_scales,
@@ -349,8 +349,8 @@ void SpeculateWriteCacheWithRoPEKernel(
reinterpret_cast<DataType_*>(value_cache_out->data<T>()),
reinterpret_cast<DataType_*>(qkv_out->data<T>()),
block_tables.data<int>(),
padding_offsets.data<int>(),
cum_offsets.data<int>(),
batch_id_per_token.data<int>(),
cu_seqlens_q.data<int>(),
seq_lens.data<int>(),
seq_lens_encoder.data<int>(),
cos_emb,
@@ -376,8 +376,8 @@ void SpeculateWriteCacheWithRoPEKernel(
value_cache_out->data<uint8_t>(),
reinterpret_cast<DataType_*>(qkv_out->data<T>()),
block_tables.data<int>(),
padding_offsets.data<int>(),
cum_offsets.data<int>(),
batch_id_per_token.data<int>(),
cu_seqlens_q.data<int>(),
seq_lens.data<int>(),
seq_lens_encoder.data<int>(),
cos_emb,
@@ -409,8 +409,8 @@ void SpeculateWriteCacheWithRoPEKernel(
value_cache_out->data<uint8_t>(),
reinterpret_cast<DataType_*>(qkv_out->data<T>()),
block_tables.data<int>(),
padding_offsets.data<int>(),
cum_offsets.data<int>(),
batch_id_per_token.data<int>(),
cu_seqlens_q.data<int>(),
seq_lens.data<int>(),
seq_lens_encoder.data<int>(),
cos_emb,
@@ -442,8 +442,8 @@ void SpeculateWriteCacheWithRoPEKernel(
value_cache_out->data<uint8_t>(),
reinterpret_cast<DataType_*>(const_cast<T*>(qkv_out->data<T>())),
block_tables.data<int>(),
padding_offsets.data<int>(),
cum_offsets.data<int>(),
batch_id_per_token.data<int>(),
cu_seqlens_q.data<int>(),
seq_lens.data<int>(),
seq_lens_encoder.data<int>(),
cos_emb,
@@ -488,8 +488,8 @@ template void SpeculateWriteCacheWithRoPEKernel<paddle::bfloat16, int>(
// gqa_group_size, head_dim] if GQA)
const paddle::Tensor& seq_lens,
const paddle::Tensor& seq_lens_encoder,
const paddle::Tensor& padding_offsets,
const paddle::Tensor& cum_offsets,
const paddle::Tensor& batch_id_per_token,
const paddle::Tensor& cu_seqlens_q,
const paddle::Tensor& block_tables,
const paddle::optional<paddle::Tensor>& rotary_embs,
const paddle::optional<paddle::Tensor>& qkv_out_scales,
@@ -514,8 +514,8 @@ SpeculateWriteCacheWithRoPEKernel<paddle::bfloat16, paddle::bfloat16>(
// gqa_group_size, head_dim] if GQA)
const paddle::Tensor& seq_lens,
const paddle::Tensor& seq_lens_encoder,
const paddle::Tensor& padding_offsets,
const paddle::Tensor& cum_offsets,
const paddle::Tensor& batch_id_per_token,
const paddle::Tensor& cu_seqlens_q,
const paddle::Tensor& block_tables,
const paddle::optional<paddle::Tensor>& rotary_embs,
const paddle::optional<paddle::Tensor>& qkv_out_scales,
@@ -539,8 +539,8 @@ template void SpeculateWriteCacheWithRoPEKernel<paddle::float16, int>(
// gqa_group_size, head_dim] if GQA)
const paddle::Tensor& seq_lens,
const paddle::Tensor& seq_lens_encoder,
const paddle::Tensor& padding_offsets,
const paddle::Tensor& cum_offsets,
const paddle::Tensor& batch_id_per_token,
const paddle::Tensor& cu_seqlens_q,
const paddle::Tensor& block_tables,
const paddle::optional<paddle::Tensor>& rotary_embs,
const paddle::optional<paddle::Tensor>& qkv_out_scales,
@@ -566,8 +566,8 @@ SpeculateWriteCacheWithRoPEKernel<paddle::float16, paddle::float16>(
// gqa_group_size, head_dim] if GQA)
const paddle::Tensor& seq_lens,
const paddle::Tensor& seq_lens_encoder,
const paddle::Tensor& padding_offsets,
const paddle::Tensor& cum_offsets,
const paddle::Tensor& batch_id_per_token,
const paddle::Tensor& cu_seqlens_q,
const paddle::Tensor& block_tables,
const paddle::optional<paddle::Tensor>& rotary_embs,
const paddle::optional<paddle::Tensor>& qkv_out_scales,
@@ -582,4 +582,4 @@ SpeculateWriteCacheWithRoPEKernel<paddle::float16, paddle::float16>(
cudaStream_t& stream,
paddle::Tensor* qkv_out,
paddle::Tensor* key_cache_out,
paddle::Tensor* value_cache_out);
paddle::Tensor* value_cache_out);

View File

@@ -23,8 +23,8 @@ void SpeculateWriteCacheWithRoPEKernel(
// gqa_group_size, head_dim] if GQA)
const paddle::Tensor& seq_lens,
const paddle::Tensor& seq_lens_encoder,
const paddle::Tensor& padding_offsets,
const paddle::Tensor& cum_offsets,
const paddle::Tensor& batch_id_per_token,
const paddle::Tensor& cu_seqlens_q,
const paddle::Tensor& block_tables,
const paddle::optional<paddle::Tensor>& rotary_embs,
const paddle::optional<paddle::Tensor>& qkv_out_scales,
@@ -39,4 +39,4 @@ void SpeculateWriteCacheWithRoPEKernel(
cudaStream_t& stream,
paddle::Tensor* qkv_out,
paddle::Tensor* key_cache_out,
paddle::Tensor* value_cache_out);
paddle::Tensor* value_cache_out);

View File

@@ -37,8 +37,8 @@ template void CascadeAppendAttentionC16Kernel<paddle::bfloat16, paddle::bfloat16
const paddle::Tensor& seq_lens_q,
const paddle::Tensor& seq_lens_kv,
const paddle::Tensor& seq_lens_encoder,
const paddle::Tensor& padding_offsets,
const paddle::Tensor& cum_offsets,
const paddle::Tensor& batch_id_per_token,
const paddle::Tensor& cu_seqlens_q,
const paddle::Tensor& block_table,
const paddle::Tensor& batch_ids,
const paddle::Tensor& tile_ids_per_batch,

View File

@@ -36,8 +36,8 @@ template void CascadeAppendAttentionC16Kernel<paddle::bfloat16, paddle::float8_e
const paddle::Tensor& seq_lens_q,
const paddle::Tensor& seq_lens_kv,
const paddle::Tensor& seq_lens_encoder,
const paddle::Tensor& padding_offsets,
const paddle::Tensor& cum_offsets,
const paddle::Tensor& batch_id_per_token,
const paddle::Tensor& cu_seqlens_q,
const paddle::Tensor& block_table,
const paddle::Tensor& batch_ids,
const paddle::Tensor& tile_ids_per_batch,

View File

@@ -36,8 +36,8 @@ template void CascadeAppendAttentionC16Kernel<paddle::bfloat16, int8_t>(
const paddle::Tensor& seq_lens_q,
const paddle::Tensor& seq_lens_kv,
const paddle::Tensor& seq_lens_encoder,
const paddle::Tensor& padding_offsets,
const paddle::Tensor& cum_offsets,
const paddle::Tensor& batch_id_per_token,
const paddle::Tensor& cu_seqlens_q,
const paddle::Tensor& block_table,
const paddle::Tensor& batch_ids,
const paddle::Tensor& tile_ids_per_batch,

View File

@@ -36,8 +36,8 @@ template void CascadeAppendAttentionC16Kernel<paddle::float16, paddle::float16>(
const paddle::Tensor& seq_lens_q,
const paddle::Tensor& seq_lens_kv,
const paddle::Tensor& seq_lens_encoder,
const paddle::Tensor& padding_offsets,
const paddle::Tensor& cum_offsets,
const paddle::Tensor& batch_id_per_token,
const paddle::Tensor& cu_seqlens_q,
const paddle::Tensor& block_table,
const paddle::Tensor& batch_ids,
const paddle::Tensor& tile_ids_per_batch,

View File

@@ -36,8 +36,8 @@ template void CascadeAppendAttentionC16Kernel<paddle::float16, paddle::float8_e4
const paddle::Tensor& seq_lens_q,
const paddle::Tensor& seq_lens_kv,
const paddle::Tensor& seq_lens_encoder,
const paddle::Tensor& padding_offsets,
const paddle::Tensor& cum_offsets,
const paddle::Tensor& batch_id_per_token,
const paddle::Tensor& cu_seqlens_q,
const paddle::Tensor& block_table,
const paddle::Tensor& batch_ids,
const paddle::Tensor& tile_ids_per_batch,

View File

@@ -36,8 +36,8 @@ template void CascadeAppendAttentionC16Kernel<paddle::float16, int8_t>(
const paddle::Tensor& seq_lens_q,
const paddle::Tensor& seq_lens_kv,
const paddle::Tensor& seq_lens_encoder,
const paddle::Tensor& padding_offsets,
const paddle::Tensor& cum_offsets,
const paddle::Tensor& batch_id_per_token,
const paddle::Tensor& cu_seqlens_q,
const paddle::Tensor& block_table,
const paddle::Tensor& batch_ids,
const paddle::Tensor& tile_ids_per_batch,

View File

@@ -36,8 +36,8 @@ template void CascadeAppendAttentionC4Kernel<paddle::bfloat16, paddle::bfloat16>
const paddle::Tensor& seq_lens_q,
const paddle::Tensor& seq_lens_kv,
const paddle::Tensor& seq_lens_encoder,
const paddle::Tensor& padding_offsets,
const paddle::Tensor& cum_offsets,
const paddle::Tensor& batch_id_per_token,
const paddle::Tensor& cu_seqlens_q,
const paddle::Tensor& block_table,
const paddle::Tensor& batch_ids,
const paddle::Tensor& tile_ids_per_batch,

View File

@@ -36,8 +36,8 @@ template void CascadeAppendAttentionC4Kernel<paddle::bfloat16, paddle::float8_e4
const paddle::Tensor& seq_lens_q,
const paddle::Tensor& seq_lens_kv,
const paddle::Tensor& seq_lens_encoder,
const paddle::Tensor& padding_offsets,
const paddle::Tensor& cum_offsets,
const paddle::Tensor& batch_id_per_token,
const paddle::Tensor& cu_seqlens_q,
const paddle::Tensor& block_table,
const paddle::Tensor& batch_ids,
const paddle::Tensor& tile_ids_per_batch,

View File

@@ -36,8 +36,8 @@ template void CascadeAppendAttentionC4Kernel<paddle::bfloat16, int8_t>(
const paddle::Tensor& seq_lens_q,
const paddle::Tensor& seq_lens_kv,
const paddle::Tensor& seq_lens_encoder,
const paddle::Tensor& padding_offsets,
const paddle::Tensor& cum_offsets,
const paddle::Tensor& batch_id_per_token,
const paddle::Tensor& cu_seqlens_q,
const paddle::Tensor& block_table,
const paddle::Tensor& batch_ids,
const paddle::Tensor& tile_ids_per_batch,

View File

@@ -37,8 +37,8 @@ template void CascadeAppendAttentionC4Kernel<paddle::float16, paddle::float16>(
const paddle::Tensor& seq_lens_q,
const paddle::Tensor& seq_lens_kv,
const paddle::Tensor& seq_lens_encoder,
const paddle::Tensor& padding_offsets,
const paddle::Tensor& cum_offsets,
const paddle::Tensor& batch_id_per_token,
const paddle::Tensor& cu_seqlens_q,
const paddle::Tensor& block_table,
const paddle::Tensor& batch_ids,
const paddle::Tensor& tile_ids_per_batch,

View File

@@ -36,8 +36,8 @@ template void CascadeAppendAttentionC4Kernel<paddle::float16, paddle::float8_e4m
const paddle::Tensor& seq_lens_q,
const paddle::Tensor& seq_lens_kv,
const paddle::Tensor& seq_lens_encoder,
const paddle::Tensor& padding_offsets,
const paddle::Tensor& cum_offsets,
const paddle::Tensor& batch_id_per_token,
const paddle::Tensor& cu_seqlens_q,
const paddle::Tensor& block_table,
const paddle::Tensor& batch_ids,
const paddle::Tensor& tile_ids_per_batch,

View File

@@ -36,8 +36,8 @@ template void CascadeAppendAttentionC4Kernel<paddle::float16, int8_t>(
const paddle::Tensor& seq_lens_q,
const paddle::Tensor& seq_lens_kv,
const paddle::Tensor& seq_lens_encoder,
const paddle::Tensor& padding_offsets,
const paddle::Tensor& cum_offsets,
const paddle::Tensor& batch_id_per_token,
const paddle::Tensor& cu_seqlens_q,
const paddle::Tensor& block_table,
const paddle::Tensor& batch_ids,
const paddle::Tensor& tile_ids_per_batch,

View File

@@ -38,8 +38,8 @@ CascadeAppendAttentionC8Kernel<paddle::bfloat16, paddle::bfloat16, false>(
const paddle::Tensor& seq_lens_q,
const paddle::Tensor& seq_lens_kv,
const paddle::Tensor& seq_lens_encoder,
const paddle::Tensor& padding_offsets,
const paddle::Tensor& cum_offsets,
const paddle::Tensor& batch_id_per_token,
const paddle::Tensor& cu_seqlens_q,
const paddle::Tensor& block_table,
const paddle::Tensor& batch_ids,
const paddle::Tensor& tile_ids_per_batch,
@@ -85,8 +85,8 @@ CascadeAppendAttentionC8Kernel<paddle::bfloat16, paddle::bfloat16, true>(
const paddle::Tensor& seq_lens_q,
const paddle::Tensor& seq_lens_kv,
const paddle::Tensor& seq_lens_encoder,
const paddle::Tensor& padding_offsets,
const paddle::Tensor& cum_offsets,
const paddle::Tensor& batch_id_per_token,
const paddle::Tensor& cu_seqlens_q,
const paddle::Tensor& block_table,
const paddle::Tensor& batch_ids,
const paddle::Tensor& tile_ids_per_batch,

View File

@@ -36,8 +36,8 @@ template void CascadeAppendAttentionC8Kernel<paddle::bfloat16, paddle::float8_e4
const paddle::Tensor& seq_lens_q,
const paddle::Tensor& seq_lens_kv,
const paddle::Tensor& seq_lens_encoder,
const paddle::Tensor& padding_offsets,
const paddle::Tensor& cum_offsets,
const paddle::Tensor& batch_id_per_token,
const paddle::Tensor& cu_seqlens_q,
const paddle::Tensor& block_table,
const paddle::Tensor& batch_ids,
const paddle::Tensor& tile_ids_per_batch,
@@ -80,8 +80,8 @@ template void CascadeAppendAttentionC8Kernel<paddle::bfloat16, paddle::float8_e4
const paddle::Tensor& seq_lens_q,
const paddle::Tensor& seq_lens_kv,
const paddle::Tensor& seq_lens_encoder,
const paddle::Tensor& padding_offsets,
const paddle::Tensor& cum_offsets,
const paddle::Tensor& batch_id_per_token,
const paddle::Tensor& cu_seqlens_q,
const paddle::Tensor& block_table,
const paddle::Tensor& batch_ids,
const paddle::Tensor& tile_ids_per_batch,

View File

@@ -36,8 +36,8 @@ template void CascadeAppendAttentionC8Kernel<paddle::bfloat16, int8_t, false>(
const paddle::Tensor& seq_lens_q,
const paddle::Tensor& seq_lens_kv,
const paddle::Tensor& seq_lens_encoder,
const paddle::Tensor& padding_offsets,
const paddle::Tensor& cum_offsets,
const paddle::Tensor& batch_id_per_token,
const paddle::Tensor& cu_seqlens_q,
const paddle::Tensor& block_table,
const paddle::Tensor& batch_ids,
const paddle::Tensor& tile_ids_per_batch,
@@ -82,8 +82,8 @@ template void CascadeAppendAttentionC8Kernel<paddle::bfloat16, int8_t, true>(
const paddle::Tensor& seq_lens_q,
const paddle::Tensor& seq_lens_kv,
const paddle::Tensor& seq_lens_encoder,
const paddle::Tensor& padding_offsets,
const paddle::Tensor& cum_offsets,
const paddle::Tensor& batch_id_per_token,
const paddle::Tensor& cu_seqlens_q,
const paddle::Tensor& block_table,
const paddle::Tensor& batch_ids,
const paddle::Tensor& tile_ids_per_batch,

View File

@@ -36,8 +36,8 @@ template void CascadeAppendAttentionC8Kernel<paddle::float16, paddle::float16, f
const paddle::Tensor& seq_lens_q,
const paddle::Tensor& seq_lens_kv,
const paddle::Tensor& seq_lens_encoder,
const paddle::Tensor& padding_offsets,
const paddle::Tensor& cum_offsets,
const paddle::Tensor& batch_id_per_token,
const paddle::Tensor& cu_seqlens_q,
const paddle::Tensor& block_table,
const paddle::Tensor& batch_ids,
const paddle::Tensor& tile_ids_per_batch,
@@ -82,8 +82,8 @@ template void CascadeAppendAttentionC8Kernel<paddle::float16, paddle::float16, t
const paddle::Tensor& seq_lens_q,
const paddle::Tensor& seq_lens_kv,
const paddle::Tensor& seq_lens_encoder,
const paddle::Tensor& padding_offsets,
const paddle::Tensor& cum_offsets,
const paddle::Tensor& batch_id_per_token,
const paddle::Tensor& cu_seqlens_q,
const paddle::Tensor& block_table,
const paddle::Tensor& batch_ids,
const paddle::Tensor& tile_ids_per_batch,

View File

@@ -36,8 +36,8 @@ template void CascadeAppendAttentionC8Kernel<paddle::float16, paddle::float8_e4m
const paddle::Tensor& seq_lens_q,
const paddle::Tensor& seq_lens_kv,
const paddle::Tensor& seq_lens_encoder,
const paddle::Tensor& padding_offsets,
const paddle::Tensor& cum_offsets,
const paddle::Tensor& batch_id_per_token,
const paddle::Tensor& cu_seqlens_q,
const paddle::Tensor& block_table,
const paddle::Tensor& batch_ids,
const paddle::Tensor& tile_ids_per_batch,
@@ -81,8 +81,8 @@ template void CascadeAppendAttentionC8Kernel<paddle::float16, paddle::float8_e4m
const paddle::Tensor& seq_lens_q,
const paddle::Tensor& seq_lens_kv,
const paddle::Tensor& seq_lens_encoder,
const paddle::Tensor& padding_offsets,
const paddle::Tensor& cum_offsets,
const paddle::Tensor& batch_id_per_token,
const paddle::Tensor& cu_seqlens_q,
const paddle::Tensor& block_table,
const paddle::Tensor& batch_ids,
const paddle::Tensor& tile_ids_per_batch,

View File

@@ -36,8 +36,8 @@ template void CascadeAppendAttentionC8Kernel<paddle::float16, int8_t, false>(
const paddle::Tensor& seq_lens_q,
const paddle::Tensor& seq_lens_kv,
const paddle::Tensor& seq_lens_encoder,
const paddle::Tensor& padding_offsets,
const paddle::Tensor& cum_offsets,
const paddle::Tensor& batch_id_per_token,
const paddle::Tensor& cu_seqlens_q,
const paddle::Tensor& block_table,
const paddle::Tensor& batch_ids,
const paddle::Tensor& tile_ids_per_batch,
@@ -81,8 +81,8 @@ template void CascadeAppendAttentionC8Kernel<paddle::float16, int8_t, true>(
const paddle::Tensor& seq_lens_q,
const paddle::Tensor& seq_lens_kv,
const paddle::Tensor& seq_lens_encoder,
const paddle::Tensor& padding_offsets,
const paddle::Tensor& cum_offsets,
const paddle::Tensor& batch_id_per_token,
const paddle::Tensor& cu_seqlens_q,
const paddle::Tensor& block_table,
const paddle::Tensor& batch_ids,
const paddle::Tensor& tile_ids_per_batch,

View File

@@ -22,8 +22,8 @@ EncoderWriteCacheWithRopeKernel<paddle::bfloat16, paddle::bfloat16>(
const paddle::Tensor& seq_lens_this_time,
const paddle::Tensor& seq_lens_encoder,
const paddle::Tensor& seq_lens_decoder,
const paddle::Tensor& padding_offsets,
const paddle::Tensor& cum_offsets,
const paddle::Tensor& batch_id_per_token,
const paddle::Tensor& cu_seqlens_q,
const paddle::Tensor& block_tables,
const paddle::Tensor& batch_ids,
const paddle::Tensor& tile_ids,

View File

@@ -21,8 +21,8 @@ template void EncoderWriteCacheWithRopeKernel<paddle::bfloat16, int>(
const paddle::Tensor& seq_lens_this_time,
const paddle::Tensor& seq_lens_encoder,
const paddle::Tensor& seq_lens_decoder,
const paddle::Tensor& padding_offsets,
const paddle::Tensor& cum_offsets,
const paddle::Tensor& batch_id_per_token,
const paddle::Tensor& cu_seqlens_q,
const paddle::Tensor& block_tables,
const paddle::Tensor& batch_ids,
const paddle::Tensor& tile_ids,

View File

@@ -21,8 +21,8 @@ template void EncoderWriteCacheWithRopeKernel<paddle::float16, paddle::float16>(
const paddle::Tensor& seq_lens_this_time,
const paddle::Tensor& seq_lens_encoder,
const paddle::Tensor& seq_lens_decoder,
const paddle::Tensor& padding_offsets,
const paddle::Tensor& cum_offsets,
const paddle::Tensor& batch_id_per_token,
const paddle::Tensor& cu_seqlens_q,
const paddle::Tensor& block_tables,
const paddle::Tensor& batch_ids,
const paddle::Tensor& tile_ids,

View File

@@ -21,8 +21,8 @@ template void EncoderWriteCacheWithRopeKernel<paddle::float16, int>(
const paddle::Tensor& seq_lens_this_time,
const paddle::Tensor& seq_lens_encoder,
const paddle::Tensor& seq_lens_decoder,
const paddle::Tensor& padding_offsets,
const paddle::Tensor& cum_offsets,
const paddle::Tensor& batch_id_per_token,
const paddle::Tensor& cu_seqlens_q,
const paddle::Tensor& block_tables,
const paddle::Tensor& batch_ids,
const paddle::Tensor& tile_ids,

View File

@@ -30,4 +30,4 @@ inline int getSMVersion()
return sm_major * 10 + sm_minor;
}
}
}

View File

@@ -54,7 +54,7 @@ std::vector<paddle::Tensor> AppendAttention(
const paddle::Tensor &value_cache, const paddle::Tensor &seq_lens_encoder,
const paddle::Tensor &seq_lens_decoder,
const paddle::Tensor &seq_lens_this_time,
const paddle::Tensor &padding_offsets, const paddle::Tensor &cum_offsets,
const paddle::Tensor &batch_id_per_token, const paddle::Tensor &cu_seqlens_q,
const paddle::Tensor &block_tables, const paddle::Tensor &encoder_batch_ids,
const paddle::Tensor &encoder_tile_ids_per_batch,
const paddle::Tensor &encoder_num_blocks,
@@ -94,7 +94,7 @@ std::vector<paddle::Tensor> GQARopeWriteCacheKernel(
const paddle::Tensor &seq_lens_this_time,
const paddle::Tensor &seq_lens_encoder,
const paddle::Tensor &seq_lens_decoder,
const paddle::Tensor &padding_offsets, const paddle::Tensor &cum_offsets,
const paddle::Tensor &batch_id_per_token,
const paddle::Tensor &block_tables, const paddle::Tensor &kv_batch_ids,
const paddle::Tensor &kv_tile_ids, const paddle::Tensor &kv_num_blocks,
const paddle::Tensor &cache_batch_ids, const paddle::Tensor &cache_tile_ids,
@@ -116,11 +116,11 @@ PreCacheLenConcat(const paddle::Tensor &seq_lens_decoder,
paddle::Tensor FusedExpertMoeFunc(
const paddle::Tensor &input, const paddle::Tensor &gate_weight,
const paddle::Tensor &ffn1_weight, const paddle::Tensor &ffn2_weight,
const paddle::optional<paddle::Tensor> &ffn1_bias,
const paddle::optional<paddle::Tensor> &ffn1_scale,
const paddle::optional<paddle::Tensor> &ffn2_bias,
const paddle::optional<paddle::Tensor> &ffn2_scale,
const paddle::Tensor &up_gate_proj_weight, const paddle::Tensor &down_proj_weight,
const paddle::optional<paddle::Tensor> &up_gate_proj_bias,
const paddle::optional<paddle::Tensor> &up_gate_proj_scale,
const paddle::optional<paddle::Tensor> &down_proj_bias,
const paddle::optional<paddle::Tensor> &down_proj_scale,
const std::string &quant_method, const int moe_topk,
const bool norm_topk_prob, const bool group_moe);
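// Note: the ffn1/ffn2 names become up_gate_proj/down_proj throughout this
// header. Assuming the SwiGLU layout the new names imply (an illustrative
// per-element view, not FastDeploy's kernel):
//   up, gate = split(x @ up_gate_proj_weight)   // concatenated [up | gate]
//   y        = (up * silu(gate)) @ down_proj_weight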
@@ -149,7 +149,7 @@ MoERedundantTopKSelectKernel(const paddle::Tensor &gating_logits,
std::vector<paddle::Tensor>
EPMoeExpertDispatch(const paddle::Tensor &input, const paddle::Tensor &topk_ids,
const paddle::Tensor &topk_weights,
const paddle::optional<paddle::Tensor> &ffn1_in_scale,
const paddle::optional<paddle::Tensor> &up_gate_proj_in_scale,
const std::vector<int> &token_nums_per_expert,
const int token_nums_this_rank,
const std::string &moe_quant_type);
@@ -158,7 +158,8 @@ std::vector<paddle::Tensor> EPMoeExpertDispatchFP8(
const paddle::Tensor &input, const paddle::Tensor &scale,
const paddle::Tensor &topk_ids, const paddle::Tensor &topk_weights,
const paddle::Tensor &token_nums_per_expert,
const paddle::Tensor &token_nums_per_expert_padded);
const paddle::Tensor &token_nums_per_expert_padded,
const bool use_in_ep, const int token_nums_this_rank_padded);
std::vector<paddle::Tensor> PerTokenQuant(paddle::Tensor &input,
const int block_size);
@@ -172,7 +173,7 @@ std::vector<paddle::Tensor> EPMoeExpertCombine(
const paddle::Tensor &ffn_out, const paddle::Tensor &expert_scales_float,
const paddle::Tensor &permute_indices_per_token,
const paddle::Tensor &top_k_indices,
const paddle::optional<paddle::Tensor> &ffn2_bias,
const paddle::optional<paddle::Tensor> &down_proj_bias,
const bool norm_topk_prob, const float routed_scaling_factor);
std::vector<std::vector<int>> GetExpertTokenNum(const paddle::Tensor &topk_ids,
@@ -181,35 +182,35 @@ std::vector<std::vector<int>> GetExpertTokenNum(const paddle::Tensor &topk_ids,
paddle::Tensor MoeExpertFFNFunc(
const paddle::Tensor& permute_input,
const paddle::Tensor& tokens_expert_prefix_sum,
const paddle::Tensor& ffn1_weight, const paddle::Tensor& ffn2_weight,
const paddle::optional<paddle::Tensor>& ffn1_bias,
const paddle::optional<paddle::Tensor>& ffn1_scale,
const paddle::optional<paddle::Tensor>& ffn2_scale,
const paddle::optional<paddle::Tensor>& ffn2_in_scale,
const paddle::Tensor& up_gate_proj_weight, const paddle::Tensor& down_proj_weight,
const paddle::optional<paddle::Tensor>& up_gate_proj_bias,
const paddle::optional<paddle::Tensor>& up_gate_proj_scale,
const paddle::optional<paddle::Tensor>& down_proj_scale,
const paddle::optional<paddle::Tensor>& down_proj_in_scale,
const paddle::optional<paddle::Tensor>& expert_idx_per_token,
const std::string& quant_method, const bool used_in_ep_low_latency);
paddle::Tensor MoeExpertFFNWint2Func(
const paddle::Tensor& permute_input,
const paddle::Tensor& tokens_expert_prefix_sum,
const paddle::Tensor& ffn1_weight,
const paddle::Tensor& ffn2_weight,
const paddle::optional<paddle::Tensor>& ffn1_bias,
const paddle::optional<paddle::Tensor>& ffn1_scale,
const paddle::optional<paddle::Tensor>& ffn2_scale,
const paddle::optional<paddle::Tensor>& ffn1_local_scale,
const paddle::optional<paddle::Tensor>& ffn1_code_scale,
const paddle::optional<paddle::Tensor>& ffn1_code_zp,
const paddle::optional<paddle::Tensor>& ffn2_local_scale,
const paddle::optional<paddle::Tensor>& ffn2_code_scale,
const paddle::optional<paddle::Tensor>& ffn2_code_zp,
const paddle::Tensor& up_gate_proj_weight,
const paddle::Tensor& down_proj_weight,
const paddle::optional<paddle::Tensor>& up_gate_proj_bias,
const paddle::optional<paddle::Tensor>& up_gate_proj_scale,
const paddle::optional<paddle::Tensor>& down_proj_scale,
const paddle::optional<paddle::Tensor>& up_gate_proj_local_scale,
const paddle::optional<paddle::Tensor>& up_gate_proj_code_scale,
const paddle::optional<paddle::Tensor>& up_gate_proj_code_zp,
const paddle::optional<paddle::Tensor>& down_proj_local_scale,
const paddle::optional<paddle::Tensor>& down_proj_code_scale,
const paddle::optional<paddle::Tensor>& down_proj_code_zp,
const bool used_in_ep_low_latency);
paddle::Tensor MoeExpertReduceFunc(
const paddle::Tensor &ffn_out, const paddle::Tensor &top_k_weight,
const paddle::Tensor &permute_indices_per_token,
const paddle::Tensor &top_k_indices,
const paddle::optional<paddle::Tensor> &ffn2_bias,
const paddle::optional<paddle::Tensor> &down_proj_bias,
const bool norm_topk_prob, const float routed_scaling_factor);
void InitKVSignalPerQuery(const paddle::Tensor &seq_lens_encoder_tensor,
@@ -233,7 +234,7 @@ paddle::Tensor InitSignalLayerwiseFunc(const paddle::Tensor &kv_signal_metadata,
std::vector<paddle::Tensor> GetBlockShapeAndSplitKVBlock(
const paddle::Tensor &seq_lens_encoder,
const paddle::Tensor &seq_lens_decoder,
const paddle::Tensor &seq_lens_this_time, const paddle::Tensor &cum_offsets,
const paddle::Tensor &seq_lens_this_time,
const int encoder_block_shape_q, const int decoder_block_shape_q,
const int group_size, const int block_size,
const int decoder_step_token_num);
@@ -283,6 +284,32 @@ void UpdateInputes(const paddle::Tensor &stop_flags,
const paddle::Tensor &next_tokens,
const paddle::Tensor &is_block_step);
void UpdateInputesV1(const paddle::Tensor &stop_flags,
const paddle::Tensor &not_need_stop, // only on cpu
const paddle::Tensor &seq_lens_this_time,
const paddle::Tensor &seq_lens_encoder,
const paddle::Tensor &seq_lens_decoder,
const paddle::Tensor &step_seq_lens_decoder,
const paddle::Tensor &prompt_lens,
const paddle::Tensor &topk_ids,
const paddle::Tensor &input_ids,
const paddle::Tensor &block_tables,
const paddle::Tensor &stop_nums,
const paddle::Tensor &next_tokens,
const paddle::Tensor &is_block_step,
const int block_size);
void RecoverDecodeTask(const paddle::Tensor &stop_flags,
const paddle::Tensor &seq_lens_this_time,
const paddle::Tensor &seq_lens_encoder,
const paddle::Tensor &seq_lens_decoder,
const paddle::Tensor &step_seq_lens_decoder,
const paddle::Tensor &block_tables,
const paddle::Tensor &is_block_step,
const int block_size);
paddle::Tensor
GroupSwigluWithMasked(const paddle::Tensor &fc1_out_tensor,
const paddle::Tensor &token_nums_per_expert);
@@ -329,8 +356,8 @@ std::vector<paddle::Tensor> DecodeMLAWriteCacheKernel(
const paddle::Tensor& kv_cache,
const paddle::Tensor& seq_lens,
const paddle::Tensor& seq_lens_encoder,
const paddle::Tensor& padding_offsets,
const paddle::Tensor& cum_offsets,
const paddle::Tensor& batch_id_per_token,
const paddle::Tensor& cu_seqlens_q,
const paddle::Tensor& block_tables,
const std::string& cache_quant_type_str,
const int max_seq_len,
@@ -342,8 +369,8 @@ std::vector<paddle::Tensor> DecodeMLAWriteCacheKernel(
const paddle::Tensor& kv_cache,
const paddle::Tensor& seq_lens,
const paddle::Tensor& seq_lens_decoder,
const paddle::Tensor& padding_offsets,
const paddle::Tensor& cum_offsets,
const paddle::Tensor& batch_id_per_token,
const paddle::Tensor& cu_seqlens_q,
const paddle::Tensor& block_tables,
const std::string& cache_quant_type_str,
const int max_seq_len);
@@ -368,8 +395,7 @@ std::vector<paddle::Tensor> MultiHeadLatentAttention(
const paddle::Tensor& seq_lens_decoder,
const paddle::Tensor& seq_lens_this_time,
const paddle::Tensor& cu_seqlens_q,
const paddle::Tensor& padding_offsets,
const paddle::Tensor& cum_offsets,
const paddle::Tensor& batch_id_per_token,
const paddle::Tensor& block_tables,
const paddle::Tensor& encoder_batch_ids,
const paddle::Tensor& encoder_tile_ids_per_batch,
@@ -468,6 +494,268 @@ std::vector<paddle::Tensor> NoauxTc(
int topk,
float routed_scaling_factor);
#ifdef ENABLE_FP8
paddle::Tensor cutlass_fp8_fp8_half_gemm_func(
const paddle::Tensor& x,
const paddle::Tensor& y,
const paddle::optional<paddle::Tensor>& bias,
bool trans_x,
bool trans_y,
float scale, // only support per-tensor quantization
std::string output_dtype,
std::string activation_type);
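// Note: "per-tensor quantization" here means a single dequantization scalar
// for the whole output; conceptually out = activation(acc * scale + bias),
// as opposed to per-channel or per-token scales (an illustrative reading of
// the declaration above, not the kernel's exact epilogue).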
paddle::Tensor MoeFusedHadamardQuantFp8Func(
const paddle::Tensor &input,
const paddle::Tensor &scale,
const paddle::Tensor &topk_ids,
const int top_k,
const int intermediate_size,
const bool tiled);
paddle::Tensor FusedHadamardQuantFp8Func(
const paddle::Tensor &input,
const float scale);
#endif
int64_t init_custom_all_reduce(const std::vector<int64_t>& fake_ipc_ptrs,
paddle::Tensor& rank_data, int64_t rank, bool full_nvlink);
void all_reduce(int64_t _fa, paddle::Tensor& inp, paddle::Tensor& out,
int64_t reg_buffer, int64_t reg_buffer_sz_bytes);
void dispose(int64_t _fa);
int64_t meta_size();
void register_buffer(int64_t _fa, const std::vector<int64_t>& fake_ipc_ptrs);
std::tuple<std::vector<int64_t>, std::vector<int64_t>> get_graph_buffer_ipc_meta(int64_t _fa);
void register_graph_buffers(int64_t _fa,
const std::vector<std::vector<int64_t>>& handles,
const std::vector<std::vector<int64_t>>& offsets);
std::tuple<int64_t, paddle::Tensor> allocate_shared_buffer_and_handle(
int64_t size);
int64_t open_mem_handle(paddle::Tensor& mem_handle);
void free_shared_buffer(int64_t buffer);
// speculative decoding Kernel
std::vector<paddle::Tensor> SpeculateGetPaddingOffset(
const paddle::Tensor& input_ids,
const paddle::Tensor& draft_tokens,
const paddle::Tensor& cum_offsets,
const paddle::Tensor& token_num,
const paddle::Tensor& seq_len,
const paddle::Tensor& seq_lens_encoder);
std::vector<paddle::Tensor> SpeculateGetSeqLensOutput(
const paddle::Tensor& seq_lens_this_time,
const paddle::Tensor& seq_lens_encoder,
const paddle::Tensor& seq_lens_decoder);
std::vector<paddle::Tensor> SpeculateGetOutputPaddingOffset(
const paddle::Tensor& output_cum_offsets_tmp,
const paddle::Tensor& out_token_num,
const paddle::Tensor& seq_lens_output,
const int max_seq_len);
void SpecTokenPenaltyMultiScores(const paddle::Tensor &pre_ids,
const paddle::Tensor &logits,
const paddle::Tensor &penalty_scores,
const paddle::Tensor &frequency_scores,
const paddle::Tensor &presence_scores,
const paddle::Tensor &temperatures,
const paddle::Tensor &bad_tokens,
const paddle::Tensor &cur_len,
const paddle::Tensor &min_len,
const paddle::Tensor &eos_token_id,
const paddle::Tensor &seq_lens_this_time,
const paddle::Tensor &output_padding_offset,
const paddle::Tensor &output_cum_offsets,
const int max_seq_len);
void SpecGetStopFlagsMultiSeqs(const paddle::Tensor &accept_tokens,
const paddle::Tensor &accept_num,
const paddle::Tensor &pre_ids,
const paddle::Tensor &step_idx,
const paddle::Tensor &stop_flags,
const paddle::Tensor &seq_lens,
const paddle::Tensor &stop_seqs,
const paddle::Tensor &stop_seqs_len,
const paddle::Tensor &end_ids);
void SpeculateVerify(
const paddle::Tensor &accept_tokens, const paddle::Tensor &accept_num,
const paddle::Tensor &step_idx, const paddle::Tensor &stop_flags,
const paddle::Tensor &seq_lens_encoder,
const paddle::Tensor &seq_lens_decoder, const paddle::Tensor &draft_tokens,
const paddle::Tensor &seq_lens_this_time,
const paddle::Tensor &verify_tokens, const paddle::Tensor &verify_scores,
const paddle::Tensor &max_dec_len, const paddle::Tensor &end_tokens,
const paddle::Tensor &is_block_step,
const paddle::Tensor &output_cum_offsets,
const paddle::Tensor &actual_candidate_len,
const paddle::Tensor &actual_draft_token_nums, const paddle::Tensor &topp,
int max_seq_len, int verify_window, bool enable_topp, bool benchmark_mode);
void SpeculateUpdateV3(const paddle::Tensor &seq_lens_encoder,
const paddle::Tensor &seq_lens_decoder,
const paddle::Tensor &not_need_stop,
const paddle::Tensor &draft_tokens,
const paddle::Tensor &actual_draft_token_nums,
const paddle::Tensor &accept_tokens,
const paddle::Tensor &accept_num,
const paddle::Tensor &stop_flags,
const paddle::Tensor &seq_lens_this_time,
const paddle::Tensor &is_block_step,
const paddle::Tensor &stop_nums);
void SpeculateSetValueByFlagsAndIdx(const paddle::Tensor &pre_ids_all,
const paddle::Tensor &accept_tokens,
const paddle::Tensor &accept_num,
const paddle::Tensor &stop_flags,
const paddle::Tensor &seq_lens_this_time,
const paddle::Tensor &seq_lens_encoder,
const paddle::Tensor &seq_lens_decoder,
const paddle::Tensor &step_idx);
void SpeculateSaveWithOutputMsgStatic(const paddle::Tensor& accept_tokens,
const paddle::Tensor& accept_num,
const paddle::Tensor& not_need_stop,
int64_t rank_id,
bool save_each_rank);
void SpeculateClearAcceptNums(const paddle::Tensor& accept_num,
const paddle::Tensor& seq_lens_decoder);
void NgramMatch(const paddle::Tensor &input_ids,
const paddle::Tensor &input_ids_len,
const paddle::Tensor &pre_ids,
const paddle::Tensor &step_idx,
const paddle::Tensor &draft_token_num,
const paddle::Tensor &draft_tokens,
const paddle::Tensor &seq_lens_this_time,
const paddle::Tensor &seq_lens_encoder,
const paddle::Tensor &seq_lens_decoder,
const paddle::Tensor &max_dec_len,
const int max_ngram_size,
const int max_draft_tokens);
// MTP
void DraftModelPostprocess(const paddle::Tensor& base_model_draft_tokens,
const paddle::Tensor& base_model_seq_lens_this_time,
const paddle::Tensor& base_model_seq_lens_encoder,
const paddle::Tensor& base_model_stop_flags);
void DraftModelPreprocess(const paddle::Tensor& draft_tokens,
const paddle::Tensor& input_ids,
const paddle::Tensor& stop_flags,
const paddle::Tensor& seq_lens_this_time,
const paddle::Tensor& seq_lens_encoder,
const paddle::Tensor& seq_lens_decoder,
const paddle::Tensor& step_idx,
const paddle::Tensor& not_need_stop,
const paddle::Tensor& batch_drop,
const paddle::Tensor& accept_tokens,
const paddle::Tensor& accept_num,
const paddle::Tensor& base_model_seq_lens_encoder,
const paddle::Tensor& base_model_seq_lens_decoder,
const paddle::Tensor& base_model_step_idx,
const paddle::Tensor& base_model_stop_flags,
const paddle::Tensor& base_model_is_block_step,
const paddle::Tensor& base_model_draft_tokens,
const int max_draft_token,
const bool truncate_first_token,
const bool splitwise_prefill);
void DraftModelUpdate(const paddle::Tensor& inter_next_tokens,
const paddle::Tensor& draft_tokens,
const paddle::Tensor& pre_ids,
const paddle::Tensor& seq_lens_this_time,
const paddle::Tensor& seq_lens_encoder,
const paddle::Tensor& seq_lens_decoder,
const paddle::Tensor& step_idx,
const paddle::Tensor& output_cum_offsets,
const paddle::Tensor& stop_flags,
const paddle::Tensor& not_need_stop,
const paddle::Tensor& max_dec_len,
const paddle::Tensor& end_ids,
const paddle::Tensor& base_model_draft_tokens,
const int max_seq_len,
const int substep);
std::vector<paddle::Tensor> EagleGetHiddenStates(
const paddle::Tensor& input,
const paddle::Tensor& seq_lens_this_time,
const paddle::Tensor& seq_lens_encoder,
const paddle::Tensor& seq_lens_decoder,
const paddle::Tensor& stop_flags,
const paddle::Tensor& accept_nums,
const paddle::Tensor& base_model_seq_lens_this_time,
const paddle::Tensor& base_model_seq_lens_encoder,
const int actual_draft_token_num);
std::vector<paddle::Tensor> EagleGetSelfHiddenStates(
const paddle::Tensor& input,
const paddle::Tensor& last_seq_lens_this_time,
const paddle::Tensor& seq_lens_this_time,
const paddle::Tensor& step_idx);
void MTPStepPaddle(
const paddle::Tensor &base_model_stop_flags,
const paddle::Tensor &stop_flags,
const paddle::Tensor &batch_drop,
const paddle::Tensor &seq_lens_this_time,
const paddle::Tensor &seq_lens_encoder,
const paddle::Tensor &seq_lens_decoder,
const paddle::Tensor &block_tables, // [bsz, block_num_per_seq]
const paddle::Tensor &encoder_block_lens,
const paddle::Tensor &used_list_len,
const paddle::Tensor &free_list,
const paddle::Tensor &free_list_len,
const int block_size,
const int max_draft_tokens);
void SpeculateStepPaddle(
const paddle::Tensor &stop_flags,
const paddle::Tensor &seq_lens_this_time,
const paddle::Tensor &ori_seq_lens_encoder,
const paddle::Tensor &seq_lens_encoder,
const paddle::Tensor &seq_lens_decoder,
const paddle::Tensor &block_tables, // [bsz, block_num_per_seq]
const paddle::Tensor &encoder_block_lens,
const paddle::Tensor &is_block_step,
const paddle::Tensor &step_block_list,
const paddle::Tensor &step_lens,
const paddle::Tensor &recover_block_list,
const paddle::Tensor &recover_lens,
const paddle::Tensor &need_block_list,
const paddle::Tensor &need_block_len,
const paddle::Tensor &used_list_len,
const paddle::Tensor &free_list,
const paddle::Tensor &free_list_len,
const paddle::Tensor &input_ids,
const paddle::Tensor &pre_ids,
const paddle::Tensor &step_idx,
const paddle::Tensor &next_tokens,
const paddle::Tensor &first_token_ids,
const paddle::Tensor &accept_num,
const int block_size,
const int encoder_decoder_block_num,
const int max_draft_tokens);
PYBIND11_MODULE(fastdeploy_ops, m) {
m.def("get_expert_token_num", &GetExpertTokenNum, py::arg("topk_ids"),
@@ -477,7 +765,7 @@ PYBIND11_MODULE(fastdeploy_ops, m) {
* moe/fused_moe/moe_redundant_topk_select.cu
* moe_redundant_topk_select
*/
m.def("f_moe_redundant_topk_select", &MoERedundantTopKSelectKernel,
m.def("moe_redundant_topk_select", &MoERedundantTopKSelectKernel,
py::arg("gating_logits"), py::arg("expert_id_to_ep_rank_array"),
py::arg("expert_in_rank_num_list"),
py::arg("tokens_per_expert_stats_list"), py::arg("bias"),
@@ -559,7 +847,7 @@ PYBIND11_MODULE(fastdeploy_ops, m) {
* ep_moe_dispatch
*/
m.def("ep_moe_expert_dispatch", &EPMoeExpertDispatch, py::arg("input"),
py::arg("topk_ids"), py::arg("topk_weights"), py::arg("ffn1_in_scale"),
py::arg("topk_ids"), py::arg("topk_weights"), py::arg("up_gate_proj_in_scale"),
py::arg("token_nums_per_expert"), py::arg("token_nums_this_rank"),
py::arg("moe_quant_type"), "ep moe export dispatch function");
@@ -567,7 +855,7 @@ PYBIND11_MODULE(fastdeploy_ops, m) {
m.def("ep_moe_expert_combine", &EPMoeExpertCombine, py::arg("ffn_out"),
py::arg("expert_scales_float"), py::arg("permute_indices_per_token"),
py::arg("top_k_indices"), py::arg("ffn2_bias"),
py::arg("top_k_indices"), py::arg("down_proj_bias"),
py::arg("norm_topk_prob"), py::arg("routed_scaling_factor"),
"ep moe export combine function");
@@ -609,7 +897,7 @@ PYBIND11_MODULE(fastdeploy_ops, m) {
*/
m.def("moe_expert_reduce", &MoeExpertReduceFunc, py::arg("ffn_out"),
py::arg("top_k_weight"), py::arg("permute_indices_per_token"),
py::arg("top_k_indices"), py::arg("ffn2_bias"),
py::arg("top_k_indices"), py::arg("down_proj_bias"),
py::arg("norm_topk_prob"), py::arg("routed_scaling_factor"),
"moe export reduce function");
@@ -637,9 +925,8 @@ PYBIND11_MODULE(fastdeploy_ops, m) {
* append_attn/get_block_shape_and_split_kv_block.cu
* get_block_shape_and_split_kv_block
*/
// m.def("f_get_block_shape_and_split_kv_block",
// &GetBlockShapeAndSplitKVBlock, "get_block_shape_and_split_kv_block
// function");
m.def("get_block_shape_and_split_kv_block",
&GetBlockShapeAndSplitKVBlock, "get_block_shape_and_split_kv_block function");
/**
* get_padding_offset.cu
@@ -680,6 +967,18 @@ PYBIND11_MODULE(fastdeploy_ops, m) {
*/
m.def("update_inputs", &UpdateInputes, "update_inputs function");
/**
* update_inputs_v1.cu
* update_inputs_v1
*/
m.def("update_inputs_v1", &UpdateInputesV1, "update inputs for scheduler v1 function");
/**
* recover_decode_task.cu
* recover_decode_task
*/
m.def("recover_decode_task", &RecoverDecodeTask, "recover decode task for scheduler v1 function");
/**
* extract_text_token_output.cu
* extract_text_token_output
@@ -700,35 +999,17 @@ PYBIND11_MODULE(fastdeploy_ops, m) {
m.def("tritonmoe_preprocess_func", &tritonmoe_preprocess_kernel);
m.def("MoeWna16MarlinGemmApi", &MoeWna16MarlinGemmApi,
py::arg("a"),
py::arg("c_or_none"),
py::arg("b_q_weight"),
py::arg("b_scales"),
py::arg("global_scale_or_none"),
py::arg("b_zeros_or_none"),
py::arg("g_idx_or_none"),
py::arg("perm_or_none"),
py::arg("workspace"),
py::arg("sorted_token_ids"),
py::arg("expert_ids"),
py::arg("num_tokens_post_padded"),
py::arg("topk_weights"),
py::arg("moe_block_size"),
py::arg("top_k"),
py::arg("mul_topk_weights"),
py::arg("is_ep"),
py::arg("b_q_type_str"),
py::arg("size_m"),
py::arg("size_n"),
py::arg("size_k"),
py::arg("is_k_full"),
py::arg("use_atomic_add"),
py::arg("use_fp32_reduce"),
py::arg("is_zp_float"));
py::arg("a"), py::arg("c_or_none"), py::arg("b_q_weight"),
py::arg("b_scales"), py::arg("global_scale_or_none"), py::arg("b_zeros_or_none"),
py::arg("g_idx_or_none"), py::arg("perm_or_none"), py::arg("workspace"), py::arg("sorted_token_ids"),
py::arg("expert_ids"), py::arg("num_tokens_post_padded"), py::arg("topk_weights"), py::arg("moe_block_size"),
py::arg("top_k"), py::arg("mul_topk_weights"), py::arg("is_ep"), py::arg("b_q_type_str"),
py::arg("size_m"), py::arg("size_n"), py::arg("size_k"), py::arg("is_k_full"), py::arg("use_atomic_add"),
py::arg("use_fp32_reduce"), py::arg("is_zp_float"));
m.def("get_position_ids_and_mask_encoder_batch", &GetPositionIdsAndMaskEncoderBatch,
"get_position_ids_and_mask_encoder_batch function");
/**
* cutlass_scaled_mm.cu
* cutlass_scaled_mm
@@ -762,4 +1043,73 @@ PYBIND11_MODULE(fastdeploy_ops, m) {
m.def("multi_head_latent_attention", &MultiHeadLatentAttention, "multi_head_latent_attention function");
m.def("noaux_tc",&NoauxTc, "noaux_tc for Deepseekv3 MoE compute");
#ifdef ENABLE_FP8
m.def("cutlass_fp8_fp8_half_gemm_fused", &cutlass_fp8_fp8_half_gemm_func,
py::arg("x"), py::arg("y"), py::arg("bias"), py::arg("transpose_x"),
py::arg("transpose_y"), py::arg("scale"), py::arg("output_dtype"),
py::arg("activation_type"), "cutlass_fp8_fp8_half_gemm_fused function");
m.def("moe_fused_hadamard_quant_fp8", &MoeFusedHadamardQuantFp8Func,
py::arg("input"), py::arg("scale"), py::arg("topk_ids"),
py::arg("top_k"), py::arg("intermediate_size"), py::arg("tiled"), "moe_fused_hadamard_quant_fp8 function");
m.def("fused_hadamard_quant_fp8", &FusedHadamardQuantFp8Func,
py::arg("input"), py::arg("scale"), "fused_hadamard_quant_fp8 function");
#endif
m.def("init_custom_all_reduce", &init_custom_all_reduce, "init all reduce class function");
m.def("all_reduce", &all_reduce, "all reduce function");
m.def("dispose", &dispose, "del function for python");
m.def("meta_size", &meta_size, "meta_size function for Signal struct");
m.def("register_buffer", &register_buffer, "register ipc buffer");
m.def("register_graph_buffers", &register_graph_buffers, "register_graph_buffers");
m.def("allocate_shared_buffer_and_handle", &allocate_shared_buffer_and_handle, "allocate_shared_buffer_and_handle");
m.def("free_shared_buffer", &free_shared_buffer, "free_shared_buffer");
m.def("open_mem_handle", &open_mem_handle, "open_mem_handle");
m.def("get_graph_buffer_ipc_meta", &get_graph_buffer_ipc_meta, "get_graph_buffer_ipc_meta");
// speculative decoding Kernel
m.def("speculate_get_padding_offset", &SpeculateGetPaddingOffset, "speculate_get_padding_offset function");
m.def("speculate_get_seq_lens_output", &SpeculateGetSeqLensOutput, "speculate_get_seq_lens_output function");
m.def("speculate_get_output_padding_offset",&SpeculateGetOutputPaddingOffset, "speculate_get_output_padding_offset function");
m.def("speculate_get_token_penalty_multi_scores",&SpecTokenPenaltyMultiScores, "speculate_get_token_penalty_multi_scores function");
m.def("speculate_set_stop_value_multi_seqs",&SpecGetStopFlagsMultiSeqs, "speculate_set_stop_value_multi_seqs function");
m.def("speculate_verify",&SpeculateVerify, "speculate_verify function");
m.def("speculate_update_v3",&SpeculateUpdateV3, "noaux_tc for Deepseekv3 MoE compute function");
m.def("speculate_set_value_by_flags_and_idx",&SpeculateSetValueByFlagsAndIdx, "speculate_set_value_by_flags_and_idx function");
m.def("speculate_save_output", &SpeculateSaveWithOutputMsgStatic, "speculate_save_output function");
m.def("speculate_clear_accept_nums",&SpeculateClearAcceptNums, "speculate_clear_accept_nums function");
m.def("ngram_match", &NgramMatch, "ngram_match function");
m.def("draft_model_postprocess",&DraftModelPostprocess, "draft_model_postprocess function");
m.def("draft_model_preprocess",&DraftModelPreprocess, "draft_model_preprocess function");
m.def("draft_model_update",&DraftModelUpdate, "draft_model_update function");
m.def("eagle_get_hidden_states",&EagleGetHiddenStates, "eagle_get_hidden_states function");
m.def("eagle_get_self_hidden_states", &EagleGetSelfHiddenStates, "eagle_get_self_hidden_states function");
m.def("mtp_step_paddle",&MTPStepPaddle, "mtp_step_paddle function");
m.def("speculate_step_paddle",&SpeculateStepPaddle, "speculate_step_paddle function");
}

View File

@@ -0,0 +1,165 @@
// adapted from: https://github.com/vllm-project/vllm/blob/118ff921118cc81061a2af865a1e13840ceb6792/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_int8.cu
// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "helper.h"
#include "all_reduce.cuh"
// Fake pointer type, must match fptr_t type in ops.h.
// We use this type alias to indicate when pointers are passed in as int64_t.
using fptr_t = int64_t;
static_assert(sizeof(void*) == sizeof(fptr_t));
fptr_t init_custom_all_reduce(const std::vector<fptr_t>& fake_ipc_ptrs,
paddle::Tensor& rank_data, int64_t rank,
bool full_nvlink) {
int world_size = fake_ipc_ptrs.size();
if (world_size > 8)
throw std::invalid_argument("world size > 8 is not supported");
if (world_size % 2 != 0)
throw std::invalid_argument("Odd num gpus is not supported for now");
if (rank < 0 || rank >= world_size)
throw std::invalid_argument("invalid rank passed in");
paddle::Signal* ipc_ptrs[8];
for (int i = 0; i < world_size; i++) {
ipc_ptrs[i] = reinterpret_cast<paddle::Signal*>(fake_ipc_ptrs[i]);
}
return (fptr_t) new paddle::CustomAllreduce(ipc_ptrs, rank_data.data(),
rank_data.numel(), rank, world_size,
full_nvlink);
}
/**
* Performs an out-of-place allreduce and stores result in out.
*
* If _reg_buffer is null, assumes inp.data() is already IPC-registered.
* Otherwise, _reg_buffer is assumed to be IPC-registered and inp is first
* copied into _reg_buffer.
*/
void all_reduce(fptr_t _fa, paddle::Tensor& inp, paddle::Tensor& out,
fptr_t _reg_buffer, int64_t reg_buffer_sz_bytes) {
auto fa = reinterpret_cast<paddle::CustomAllreduce*>(_fa);
auto stream = inp.stream();
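// Note: input_size assumes a 2-byte element type (fp16/bf16); staging a
// FLOAT32 input through reg_buffer this way would copy only half the bytes.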
auto input_size = inp.numel() * 2;
auto reg_buffer = reinterpret_cast<void*>(_reg_buffer);
if (reg_buffer) {
cudaMemcpyAsync(reg_buffer, inp.data(), input_size,
cudaMemcpyDeviceToDevice, stream);
} else {
reg_buffer = inp.data();
}
switch (out.dtype()) {
case phi::DataType::FLOAT32: {
fa->allreduce<float>(stream, reinterpret_cast<float*>(reg_buffer),
reinterpret_cast<float*>(out.data()),
out.numel());
break;
}
case phi::DataType::FLOAT16: {
fa->allreduce<half>(stream, reinterpret_cast<half*>(reg_buffer),
reinterpret_cast<half*>(out.data()), out.numel());
break;
}
#if (!defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 800)
case phi::DataType::BFLOAT16: {
fa->allreduce<nv_bfloat16>(
stream, reinterpret_cast<nv_bfloat16*>(reg_buffer),
reinterpret_cast<nv_bfloat16*>(out.data()), out.numel());
break;
}
#endif
default:
throw std::runtime_error(
"custom allreduce only supports float32, float16 and bfloat16");
}
}
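// Usage note: pass 0 for _reg_buffer when `inp` was already registered via
// register_buffer; otherwise pass an IPC-registered scratch buffer at least
// as large as the input, and the input is first staged into it.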
void dispose(fptr_t _fa) {
delete reinterpret_cast<paddle::CustomAllreduce*>(_fa);
}
int64_t meta_size() { return sizeof(paddle::Signal); }
void register_buffer(fptr_t _fa, const std::vector<fptr_t>& fake_ipc_ptrs) {
auto fa = reinterpret_cast<paddle::CustomAllreduce*>(_fa);
void* ipc_ptrs[8];
for (int i = 0; i < fake_ipc_ptrs.size(); i++) {
ipc_ptrs[i] = reinterpret_cast<void*>(fake_ipc_ptrs[i]);
}
fa->register_buffer(ipc_ptrs);
}
// Use vector<int64_t> to represent byte data for python binding compatibility.
std::tuple<std::vector<int64_t>, std::vector<int64_t>>
get_graph_buffer_ipc_meta(fptr_t _fa) {
auto fa = reinterpret_cast<paddle::CustomAllreduce*>(_fa);
auto [handle, offsets] = fa->get_graph_buffer_ipc_meta();
std::vector<int64_t> bytes(handle.begin(), handle.end());
return std::make_tuple(bytes, offsets);
}
// Use vector<int64_t> to represent byte data for python binding compatibility.
void register_graph_buffers(fptr_t _fa,
const std::vector<std::vector<int64_t>>& handles,
const std::vector<std::vector<int64_t>>& offsets) {
auto fa = reinterpret_cast<paddle::CustomAllreduce*>(_fa);
std::vector<std::string> bytes;
bytes.reserve(handles.size());
for (int i = 0; i < handles.size(); i++) {
bytes.emplace_back(handles[i].begin(), handles[i].end());
}
fa->register_graph_buffers(bytes, offsets);
}
std::tuple<fptr_t, paddle::Tensor> allocate_shared_buffer_and_handle(
int64_t size) {
auto device_index = phi::backends::gpu::GetCurrentDeviceId();
void* buffer;
cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed;
auto stream = paddle::GetCurrentCUDAStream(phi::GPUPlace(device_index))->raw_stream();
CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode));
// Allocate buffer
CUDACHECK(cudaMalloc((void**)&buffer, size));
CUDACHECK(cudaMemsetAsync(buffer, 0, size, stream));
CUDACHECK(cudaStreamSynchronize(stream));
CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode));
// Create IPC memhandle for the allocated buffer.
// Will use it in open_mem_handle.
auto handle =
paddle::empty({static_cast<int64_t>(sizeof(cudaIpcMemHandle_t))}, paddle::DataType::UINT8, paddle::GPUPlace(device_index));
CUDACHECK(
cudaIpcGetMemHandle((cudaIpcMemHandle_t*)handle.data(), buffer));
return std::make_tuple(reinterpret_cast<fptr_t>(buffer), handle);
}
fptr_t open_mem_handle(paddle::Tensor& mem_handle) {
void* ipc_ptr;
CUDACHECK(cudaIpcOpenMemHandle(
(void**)&ipc_ptr, *((const cudaIpcMemHandle_t*)mem_handle.data()),
cudaIpcMemLazyEnablePeerAccess));
return reinterpret_cast<fptr_t>(ipc_ptr);
}
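// Typical cross-rank flow (sketch): every rank calls
// allocate_shared_buffer_and_handle, the cudaIpcMemHandle_t tensors are
// exchanged out-of-band (e.g. all-gathered in Python), each peer handle is
// opened with open_mem_handle, and the resulting pointers are passed back in
// as int64_t values to init_custom_all_reduce / register_buffer.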
void free_shared_buffer(fptr_t buffer) {
CUDACHECK(cudaFree(reinterpret_cast<void*>(buffer)));
}

View File

@@ -0,0 +1,526 @@
// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <cuda.h>
#include <cuda_bf16.h>
#include <cuda_fp16.h>
#include <cuda_runtime.h>
#include <iostream>
#include <array>
#include <limits>
#include <map>
#include <unordered_map>
#include <vector>
#define CUDACHECK(cmd) \
do { \
cudaError_t e = cmd; \
if (e != cudaSuccess) { \
printf("Failed: Cuda error %s:%d '%s'\n", __FILE__, __LINE__, \
cudaGetErrorString(e)); \
exit(EXIT_FAILURE); \
} \
} while (0)
namespace paddle {
constexpr int kMaxBlocks = 36;
// Counter may overflow, but it's fine since unsigned int overflow is
// well-defined behavior.
using FlagType = uint32_t;
struct Signal {
alignas(128) FlagType self_counter[kMaxBlocks][8];
// Two sets of peer counters are needed for two syncs. The reason is that
// it's possible for a peer GPU block to arrive at the second sync point while
// the current GPU block hasn't passed the first sync point. Thus, the peer GPU
// may write counter+1 while the current GPU is busy waiting for counter. We use
// alternating counter array to avoid this possibility.
alignas(128) FlagType peer_counter[2][kMaxBlocks][8];
};
struct __align__(16) RankData {
const void* __restrict__ ptrs[8];
};
struct __align__(16) RankSignals {
Signal* signals[8];
};
// like std::array, but aligned
template <typename T, int sz>
struct __align__(alignof(T) * sz) array_t {
T data[sz];
using type = T;
static constexpr int size = sz;
};
// use packed type to maximize memory efficiency
// goal: generate ld.128 and st.128 instructions
template <typename T>
struct packed_t {
// the (P)acked type for load/store
using P = array_t<T, 16 / sizeof(T)>;
// the (A)ccumulator type for reduction
using A = array_t<float, 16 / sizeof(T)>;
};
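// Illustrative checks: a 16-byte pack of half holds 8 elements, so the
// vectorized loads/stores can compile to ld.128 / st.128 as intended.
static_assert(sizeof(packed_t<half>::P) == 16, "packed access is 128-bit");
static_assert(packed_t<half>::P::size == 8, "eight halves per pack");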
#define DINLINE __device__ __forceinline__
// scalar cast functions
DINLINE float upcast_s(half val) { return __half2float(val); }
template <typename T>
DINLINE T downcast_s(float val);
template <>
DINLINE half downcast_s(float val) {
return __float2half(val);
}
// scalar add functions
// for some reason when compiling with Paddle, the + operator for half and
// bfloat is disabled so we call the intrinsics directly
DINLINE half& assign_add(half& a, half b) {
a = __hadd(a, b);
return a;
}
DINLINE float& assign_add(float& a, float b) { return a += b; }
#if (!defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 800)
DINLINE float upcast_s(nv_bfloat16 val) { return __bfloat162float(val); }
template <>
DINLINE nv_bfloat16 downcast_s(float val) {
return __float2bfloat16(val);
}
DINLINE nv_bfloat16& assign_add(nv_bfloat16& a, nv_bfloat16 b) {
a = __hadd(a, b);
return a;
}
#endif
template <typename T, int N>
DINLINE array_t<T, N>& packed_assign_add(array_t<T, N>& a, array_t<T, N> b) {
#pragma unroll
for (int i = 0; i < N; i++) {
assign_add(a.data[i], b.data[i]);
}
return a;
}
template <typename T, int N>
DINLINE array_t<float, N> upcast(array_t<T, N> val) {
if constexpr (std::is_same<T, float>::value) {
return val;
} else {
array_t<float, N> out;
#pragma unroll
for (int i = 0; i < N; i++) {
out.data[i] = upcast_s(val.data[i]);
}
return out;
}
}
template <typename O>
DINLINE O downcast(array_t<float, O::size> val) {
if constexpr (std::is_same<typename O::type, float>::value) {
return val;
} else {
O out;
#pragma unroll
for (int i = 0; i < O::size; i++) {
out.data[i] = downcast_s<typename O::type>(val.data[i]);
}
return out;
}
}
static DINLINE void st_flag_release(FlagType* flag_addr, FlagType flag) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
asm volatile("st.release.sys.global.u32 [%1], %0;" ::"r"(flag),
"l"(flag_addr));
#else
asm volatile("membar.sys; st.volatile.global.u32 [%1], %0;" ::"r"(flag),
"l"(flag_addr));
#endif
}
static DINLINE FlagType ld_flag_acquire(FlagType* flag_addr) {
FlagType flag;
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
asm volatile("ld.acquire.sys.global.u32 %0, [%1];"
: "=r"(flag)
: "l"(flag_addr));
#else
asm volatile("ld.volatile.global.u32 %0, [%1]; membar.gl;"
: "=r"(flag)
: "l"(flag_addr));
#endif
return flag;
}
static DINLINE void st_flag_volatile(FlagType* flag_addr, FlagType flag) {
asm volatile("st.volatile.global.u32 [%1], %0;" ::"r"(flag), "l"(flag_addr));
}
static DINLINE FlagType ld_flag_volatile(FlagType* flag_addr) {
FlagType flag;
asm volatile("ld.volatile.global.u32 %0, [%1];"
: "=r"(flag)
: "l"(flag_addr));
return flag;
}
// is_start: whether this is the very first synchronization barrier.
// need_fence: whether a memory fence is needed. If true, a release-acquire
// semantic is used to enforce memory access order before and after this
// barrier.
template <int ngpus, bool is_start, bool need_fence = false>
DINLINE void multi_gpu_barrier(const RankSignals& sg, Signal* self_sg,
int rank) {
if constexpr (!is_start) __syncthreads();
static_assert(
!(is_start && need_fence)); // Start barrier shouldn't need fence.
if (threadIdx.x < ngpus) {
// Increment the counter. Technically we only need one counter, but we use
// multiple per block to eliminate the need to share the counter via smem.
auto val = self_sg->self_counter[blockIdx.x][threadIdx.x] += 1;
// Write the expected counter value to peer and wait for correct value from
// peer.
auto peer_counter_ptr =
&sg.signals[threadIdx.x]->peer_counter[val % 2][blockIdx.x][rank];
auto self_counter_ptr =
&self_sg->peer_counter[val % 2][blockIdx.x][threadIdx.x];
if constexpr (need_fence) {
st_flag_release(peer_counter_ptr, val);
while (ld_flag_acquire(self_counter_ptr) != val);
} else {
st_flag_volatile(peer_counter_ptr, val);
while (ld_flag_volatile(self_counter_ptr) != val);
}
}
if constexpr (is_start || need_fence) __syncthreads();
}
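// Usage note: the reduce kernels below bracket their peer-memory accesses
// with multi_gpu_barrier<ngpus, true>(...) on entry and
// multi_gpu_barrier<ngpus, false>(...) (with need_fence = true when a later
// stage reads the results) on exit, so no rank reads a peer buffer before
// every rank has arrived.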
template <typename P, int ngpus, typename A>
DINLINE P packed_reduce(const P* ptrs[], int idx) {
A tmp = upcast(ptrs[0][idx]);
#pragma unroll
for (int i = 1; i < ngpus; i++) {
packed_assign_add(tmp, upcast(ptrs[i][idx]));
}
return downcast<P>(tmp);
}
template <typename T, int ngpus>
__global__ void __launch_bounds__(512, 1)
cross_device_reduce_1stage(RankData* _dp, RankSignals sg, Signal* self_sg,
T* __restrict__ result, int rank, int size) {
using P = typename packed_t<T>::P;
using A = typename packed_t<T>::A;
// note: we don't reorder the addresses, so the accumulation order is the same
// for all ranks, ensuring bitwise identical results
auto dp = *_dp;
multi_gpu_barrier<ngpus, true>(sg, self_sg, rank);
// do the actual reduction
for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < size;
idx += gridDim.x * blockDim.x) {
((P*)result)[idx] = packed_reduce<P, ngpus, A>((const P**)&dp.ptrs[0], idx);
}
multi_gpu_barrier<ngpus, false>(sg, self_sg, rank);
}
template <typename P>
DINLINE P* get_tmp_buf(Signal* sg) {
return (P*)(((Signal*)sg) + 1);
}
template <typename T, int ngpus>
__global__ void __launch_bounds__(512, 1)
cross_device_reduce_2stage(RankData* _dp, RankSignals sg, Signal* self_sg,
T* __restrict__ result, int rank, int size) {
int tid = blockIdx.x * blockDim.x + threadIdx.x;
int stride = gridDim.x * blockDim.x;
using P = typename packed_t<T>::P;
using A = typename packed_t<T>::A;
int part = size / ngpus;
int start = rank * part;
int end = rank == ngpus - 1 ? size : start + part;
int largest_part = part + size % ngpus;
const P* ptrs[ngpus];
P* tmps[ngpus];
#pragma unroll
for (int i = 0; i < ngpus; i++) {
int target = (rank + i) % ngpus;
ptrs[i] = (const P*)_dp->ptrs[target];
tmps[i] = get_tmp_buf<P>(sg.signals[target]);
}
auto tmp_out = tmps[0];
multi_gpu_barrier<ngpus, true>(sg, self_sg, rank);
// stage 1: reduce scatter
for (int idx = start + tid; idx < end; idx += stride) {
tmp_out[idx - start] = packed_reduce<P, ngpus, A>(ptrs, idx);
}
multi_gpu_barrier<ngpus, false, true>(sg, self_sg, rank);
// stage 2: allgather. Note: it's important to match the tid between
// the two stages, because visibility across devices is only guaranteed
// between threads that have the same tid. If thread i computes the sum of
// start + i in the first stage, then thread i also gathers start + i from all
// ranks.
for (int idx = tid; idx < largest_part; idx += stride) {
#pragma unroll
for (int i = 0; i < ngpus; i++) {
int gather_from_rank = ((rank + i) % ngpus);
if (gather_from_rank == ngpus - 1 || idx < part) {
int dst_idx = gather_from_rank * part + idx;
((P*)result)[dst_idx] = tmps[i][idx];
}
}
}
}
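// Worked example: size = 10, ngpus = 4 gives part = 2; ranks 0-2 reduce
// [0,2), [2,4), [4,6) while rank 3 takes the remainder [6,10), and
// largest_part = 4 bounds the allgather loop above.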
using IPC_KEY = std::array<uint8_t, sizeof(cudaIpcMemHandle_t)>;
static_assert(sizeof(IPC_KEY) == sizeof(cudaIpcMemHandle_t));
static_assert(alignof(IPC_KEY) == alignof(cudaIpcMemHandle_t));
class CustomAllreduce {
public:
int rank_;
int world_size_;
bool full_nvlink_;
RankSignals sg_;
// Stores a map from a pointer to its peer pointers from all ranks.
std::unordered_map<void*, RankData*> buffers_;
Signal* self_sg_;
// Stores rank data from all ranks. This is mainly for cuda graph purposes.
// For cuda graph to work, all kernel arguments must be fixed during graph
// capture time. However, the peer pointers are not known during graph capture
// time. Therefore, during capture, we increment the rank data pointer and use
// that as the argument to the kernel. The kernel arguments are stored in
// graph_unreg_buffers_. The actual peer pointers will be filled in at the
// memory pointed to by the pointers in graph_unreg_buffers_ when
// the IPC handles are exchanged between ranks.
//
// The overall process looks like this:
// 1. Graph capture.
// 2. Each rank obtains the IPC handles for each address used during cuda
// graph capture using get_graph_buffer_ipc_meta.
// 3. (In Python) all gather the IPC handles.
// 4. Obtain the peer pointers by opening the IPC handles, and store them in
// the rank data array at corresponding positions.
RankData *d_rank_data_base_, *d_rank_data_end_;
std::vector<void*> graph_unreg_buffers_;
// a map from IPC handles to opened IPC pointers
std::map<IPC_KEY, char*> ipc_handles_;
/**
* Signals are an array of ipc-enabled buffers from all ranks.
* For each of the buffers, the layout is as follows:
* | -- sizeof(Signal) -- | ------ a few MB ----- |
* The first section is for allreduce synchronization, and the second section
* is for storing the intermediate results required by some allreduce algos.
*
* Note: this class does not own any device memory. Any required buffers
* are passed in from the constructor.
*/
CustomAllreduce(Signal** signals, void* rank_data, size_t rank_data_sz,
int rank, int world_size, bool full_nvlink = true)
: rank_(rank),
world_size_(world_size),
full_nvlink_(full_nvlink),
self_sg_(signals[rank]),
d_rank_data_base_(reinterpret_cast<RankData*>(rank_data)),
d_rank_data_end_(d_rank_data_base_ + rank_data_sz / sizeof(RankData)) {
for (int i = 0; i < world_size_; i++) {
sg_.signals[i] = signals[i];
}
}
char* open_ipc_handle(const void* ipc_handle) {
auto [it, new_handle] =
ipc_handles_.insert({*((IPC_KEY*)ipc_handle), nullptr});
if (new_handle) {
char* ipc_ptr;
CUDACHECK(cudaIpcOpenMemHandle((void**)&ipc_ptr,
*((const cudaIpcMemHandle_t*)ipc_handle),
cudaIpcMemLazyEnablePeerAccess));
it->second = ipc_ptr;
}
return it->second;
}
std::pair<std::string, std::vector<int64_t>> get_graph_buffer_ipc_meta() {
auto num_buffers = graph_unreg_buffers_.size();
auto handle_sz = sizeof(cudaIpcMemHandle_t);
std::string handles(handle_sz * num_buffers, static_cast<char>(0));
std::vector<int64_t> offsets(num_buffers);
for (int i = 0; i < num_buffers; i++) {
auto ptr = graph_unreg_buffers_[i];
void* base_ptr;
// note: must share the base address of each allocation, or we get wrong
// address
if (cuPointerGetAttribute(&base_ptr,
CU_POINTER_ATTRIBUTE_RANGE_START_ADDR,
(CUdeviceptr)ptr) != CUDA_SUCCESS)
throw std::runtime_error("failed to get pointer attr");
CUDACHECK(cudaIpcGetMemHandle(
(cudaIpcMemHandle_t*)&handles[i * handle_sz], base_ptr));
offsets[i] = ((char*)ptr) - ((char*)base_ptr);
}
return std::make_pair(handles, offsets);
}
void check_rank_data_capacity(size_t num = 1) {
if (d_rank_data_base_ + num > d_rank_data_end_)
throw std::runtime_error(
"Rank data buffer is overflowed by " +
std::to_string(d_rank_data_base_ + num - d_rank_data_end_));
}
/**
* Register already-shared IPC pointers.
*/
void register_buffer(void** ptrs) {
check_rank_data_capacity();
RankData data;
for (int i = 0; i < world_size_; i++) {
data.ptrs[i] = ptrs[i];
}
auto d_data = d_rank_data_base_++;
CUDACHECK(
cudaMemcpy(d_data, &data, sizeof(RankData), cudaMemcpyHostToDevice));
buffers_[ptrs[rank_]] = d_data;
}
// Note: when registering graph buffers, we intentionally choose to not
// deduplicate the addresses. That means if the allocator reuses some
// addresses, they will be registered again. This is to account for the remote
// possibility of different allocation patterns between ranks. For example,
// rank 1 may get the same input address for the second allreduce, but rank 2
// may get a different one. IPC handles have an internal reference-counting
// mechanism, so the overhead should be small.
void register_graph_buffers(
const std::vector<std::string>& handles,
const std::vector<std::vector<int64_t>>& offsets) {
auto num_buffers = graph_unreg_buffers_.size();
check_rank_data_capacity(num_buffers);
std::vector<RankData> rank_data(num_buffers);
for (int i = 0; i < num_buffers; i++) {
auto self_ptr = graph_unreg_buffers_[i];
auto& rd = rank_data[i];
for (int j = 0; j < world_size_; j++) {
if (j != rank_) {
char* handle =
open_ipc_handle(&handles[j][i * sizeof(cudaIpcMemHandle_t)]);
handle += offsets[j][i];
rd.ptrs[j] = handle;
} else {
rd.ptrs[j] = self_ptr;
}
}
}
CUDACHECK(cudaMemcpy(d_rank_data_base_, rank_data.data(),
sizeof(RankData) * num_buffers,
cudaMemcpyHostToDevice));
d_rank_data_base_ += num_buffers;
graph_unreg_buffers_.clear();
}
/**
* Performs allreduce, assuming input has already been registered.
*
 * Block and grid default configs are the results of a careful grid search. Using
 * 36 blocks gives the best or close-to-the-best runtime on the devices I
 * tried: A100, A10, A30, T4, V100. You'll notice that NCCL kernels also only
 * take a small number of SMs. I'm not quite sure of the underlying reason, but
 * my guess is that too many SMs would cause contention on the NVLink bus.
*/
template <typename T>
void allreduce(cudaStream_t stream, T* input, T* output, int size,
int threads = 512, int block_limit = 36) {
auto d = packed_t<T>::P::size;
if (size % d != 0)
throw std::runtime_error(
"custom allreduce currently requires input length to be multiple "
"of " +
std::to_string(d));
if (block_limit > kMaxBlocks)
throw std::runtime_error("max supported block limit is " +
std::to_string(kMaxBlocks) + ". Got " +
std::to_string(block_limit));
RankData* ptrs;
cudaStreamCaptureStatus status;
CUDACHECK(cudaStreamIsCapturing(stream, &status));
if (status == cudaStreamCaptureStatusActive) {
ptrs = d_rank_data_base_ + graph_unreg_buffers_.size();
graph_unreg_buffers_.push_back(input);
} else {
auto it = buffers_.find(input);
if (it == buffers_.end())
throw std::runtime_error(
"buffer address " +
std::to_string(reinterpret_cast<uint64_t>(input)) +
" is not registered!");
ptrs = it->second;
}
size /= d;
auto bytes = size * sizeof(typename packed_t<T>::P);
int blocks = std::min(block_limit, (size + threads - 1) / threads);
#define KL(ngpus, name) \
name<T, ngpus><<<blocks, threads, 0, stream>>>(ptrs, sg_, self_sg_, output, \
rank_, size);
#define REDUCE_CASE(ngpus) \
case ngpus: { \
if (world_size_ == 2) { \
KL(ngpus, cross_device_reduce_1stage); \
} else if (full_nvlink_) { \
if ((world_size_ <= 4 && bytes < 512 * 1024) || \
(world_size_ <= 8 && bytes < 256 * 1024)) { \
KL(ngpus, cross_device_reduce_1stage); \
} else { \
KL(ngpus, cross_device_reduce_2stage); \
} \
} \
break; \
}
switch (world_size_) {
REDUCE_CASE(2)
REDUCE_CASE(4)
REDUCE_CASE(6)
REDUCE_CASE(8)
default:
throw std::runtime_error(
"custom allreduce only supports num gpus in (2,4,6,8). Actual num "
"gpus = " +
std::to_string(world_size_));
}
#undef REDUCE_CASE
#undef KL
}
~CustomAllreduce() {
for (auto [_, ptr] : ipc_handles_) {
CUDACHECK(cudaIpcCloseMemHandle(ptr));
}
}
};
} // namespace paddle
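
Putting the two new files together, the binding-level call sequence looks roughly as follows (a hypothetical host-side sketch against the declarations above; the inter-rank exchange of IPC handles is elided):

// Every rank runs this with its own `rank`; handle exchange is elided.
// 1. Allocate signal + scratch memory and share its IPC handle.
// auto [signal_ptr, handle] = allocate_shared_buffer_and_handle(
//     meta_size() + scratch_bytes);
// 2. Build the reducer once all peer signal pointers are known.
// int64_t fa = init_custom_all_reduce(all_signal_ptrs, rank_data, rank,
//                                     /*full_nvlink=*/true);
// 3. Register input pointers from every rank, reduce, then tear down.
// register_buffer(fa, all_input_ptrs);
// all_reduce(fa, inp, out, /*_reg_buffer=*/0, /*reg_buffer_sz_bytes=*/0);
// dispose(fa);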

View File

@@ -136,4 +136,4 @@ struct Epilogue<ElementType, ElementsPerVectorAccess, ElementAccumulator, Epilog
ElementAccumulator, DefaultScaleMode>;
};
} // namespace cutlass_extensions
} // namespace cutlass_extensions

View File

@@ -1,11 +1,11 @@
// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
//
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//
// http://www.apache.org/licenses/LICENSE-2.0
//
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

Some files were not shown because too many files have changed in this diff.