Compare commits

...

121 Commits

Author SHA1 Message Date
yinwei
d998efbc17 [Doc]Release fastdeploy-xpu 2.0.3 (#3408)
* fix v1 schedule oom bug

* fix v1 schedule oom bug

* update release note

* update info
2025-08-14 19:19:54 +08:00
yinwei
8a15bdc0c8 [Doc]Release fastdeploy-xpu 2.1.0 (#3407)
* fix v1 schedule oom bug

* fix v1 schedule oom bug

* update release note
2025-08-14 19:11:16 +08:00
memoryCoderC
ad8ea68906 [BugFix] fix ErnieProcessor not set raw_prediction (#3401) 2025-08-14 19:10:07 +08:00
yinwei
101605869c [XPU] Fixed the issue of performance degradation caused by enabling ENABLE_V1_KVCACHE_SCHEDULER (#3393)
* fix v1 schedule oom bug

* fix v1 schedule oom bug
2025-08-14 17:41:40 +08:00
Jiang-Jia-Jun
28918702c2 Revert "Merge branch 'feature/online/vs_think_20250813' into release/2.1"
This reverts commit 02596fc537, reversing
changes made to 03347626a6.
2025-08-14 17:20:29 +08:00
Jiang-Jia-Jun
02596fc537 Merge branch 'feature/online/vs_think_20250813' into release/2.1 2025-08-14 17:13:36 +08:00
ltd0924
03347626a6 [BugFix] fix control signal release failed (#3374)
* [BugFix]

* [BugFix]

* [BugFix]

* [BugFix]

* fix

* fix

---------

Co-authored-by: YuBaoku <49938469+EmmonsCurse@users.noreply.github.com>
Co-authored-by: Jiang-Jia-Jun <163579578+Jiang-Jia-Jun@users.noreply.github.com>
2025-08-14 17:01:25 +08:00
YUNSHEN XIE
b2df0311b8 Optimize CI execution workflow. (#3371) (#3384)
* fix
2025-08-14 14:51:15 +08:00
xiaolei373
d1d321bafd feat(log):add_request_and_response_log (#3392) 2025-08-14 14:50:48 +08:00
Jiang-Jia-Jun
dc5d3ff5a0 [Polish Code] Remove useless notes 2025-08-14 14:05:29 +08:00
Jiang-Jia-Jun
f0a707e06f [BugFix] Fix default log level of paddleformers (#3377)
Co-authored-by: YuBaoku <49938469+EmmonsCurse@users.noreply.github.com>
2025-08-14 11:36:13 +08:00
JYChen
4870919682 fix stopseq error info (#3342)
Co-authored-by: YuBaoku <49938469+EmmonsCurse@users.noreply.github.com>
2025-08-14 10:45:05 +08:00
ming1753
a375378cc1 [Bug Fix] Fix V1 video bug (#3387) 2025-08-14 09:49:22 +08:00
YUNSHEN XIE
192f9caab4 Pre ce modified (#3335) (#3360)
* Pre ce modified (#3335)

* update

* update

* fix

* fix

* update

* update

* update

* fix

* update

* update

* update

* add ut fix pr(3367)
2025-08-13 18:50:52 +08:00
luukunn
81092c0fe3 add tool parser 2025-08-13 16:06:22 +08:00
YUNSHEN XIE
ad816f20f4 Use latest PaddlePaddle package (#3347) (#3352)
* Use latest PaddlePaddle package

* fix
2025-08-13 11:06:01 +08:00
memoryCoderC
37b76158f9 Completion add raw_prediction/text_after_process (#3362) 2025-08-12 23:20:36 +08:00
memoryCoderC
fe2094609f Release/2.1 (#3361)
* [BugFix] v1/completions add finish_reason

* update TestOpenAIServingCompletion for merge
2025-08-12 23:06:51 +08:00
gaoziyuan
b4bb54b56b bugfix (#3322) 2025-08-12 16:16:37 +08:00
Jiang-Jia-Jun
eeec4bd15e Remove useless code release/2.1 (#3338) 2025-08-12 11:32:50 +08:00
chenjian
d2592750f7 fix bug for scheduler v0 (#3306)
Co-authored-by: Jiang-Jia-Jun <163579578+Jiang-Jia-Jun@users.noreply.github.com>
Co-authored-by: YUNSHEN XIE <1084314248@qq.com>
2025-08-12 00:41:15 +08:00
chenjian
25f51b0611 Fix block num in scheduler v1 for release 2.1 (#3315)
* fix bug for scheduler v0

* fix block num setting in scheduler v1 for release 2.1

* fix block num setting in scheduler v1 for release 2.1

---------

Co-authored-by: Jiang-Jia-Jun <163579578+Jiang-Jia-Jun@users.noreply.github.com>
Co-authored-by: YUNSHEN XIE <1084314248@qq.com>
2025-08-12 00:41:05 +08:00
ming1753
9b07f85f6d [Bug Fix] fix vl V1 schedule bug (#3284)
Co-authored-by: Jiang-Jia-Jun <163579578+Jiang-Jia-Jun@users.noreply.github.com>
Co-authored-by: YUNSHEN XIE <1084314248@qq.com>
2025-08-12 00:40:45 +08:00
Sunny-bot1
2fe31c6f0f [Docs]fix sampling docs 2.1 (#3333)
* [Docs]fix sampling docs (#3113)

* fix sampling docs

* fix sampling docs

* update

* fix docs
2025-08-11 21:04:10 +08:00
YUNSHEN XIE
a33e557732 fix ci pypi index error (#3327) 2025-08-11 20:24:27 +08:00
kevin
054c790642 fix uvicorn multi worker error (#3309)
Co-authored-by: Jiang-Jia-Jun <163579578+Jiang-Jia-Jun@users.noreply.github.com>
2025-08-11 20:19:31 +08:00
Jiang-Jia-Jun
ca4e4ab911 Revert "[BugFix] fix ep (#3290)" (#3317)
This reverts commit 86ff68be4b.
2025-08-11 16:17:58 +08:00
chenjian
c000cff744 fix scheduler bug in release2.1 (#3295) 2025-08-10 13:55:22 +08:00
lizexu123
86ff68be4b [BugFix] fix ep (#3290)
* fix ep

* fix
2025-08-09 16:32:35 +08:00
yinwei
702c313ed1 revert pr (#3286) 2025-08-09 16:29:35 +08:00
ltd0924
6706ccb37e [BugFix] fix too many open files problem (#3275) 2025-08-08 20:11:32 +08:00
JYChen
1b6f482c15 [Cherry-pick] fix stop seq (#3263)
* fix out-bound value for stop sequence

* catch error if there are out-of-bounds value

* check in offline mode
2025-08-07 19:11:37 +08:00
sg263
5d3bf308f6 merge develop trace FD_START (#3253)
Co-authored-by: shige <shige@baidu.com>
2025-08-07 11:10:55 +08:00
Sunny-bot1
f672a34f95 [FIX 2.1]fix bad_words when sending requests consecutively (#3199)
* fix bad_words

* fix log

* fix log
2025-08-06 15:47:27 +08:00
lizexu123
bc0b92bba4 [BugFix] support real batch_size (#3109) (#3217)
* support real bsz

* fix

* fix xpu_model_runner.py,gpu_model_runner.py,gcu_model_runner.py,iluvatar_model_runner.py

* add event_loop_ep

* fix

* Add comments

* fix

* support mtp real_batch_size

* fix

* self.tmp_seq_lens_this_time->self.seq_lens_this_time_buffer

* fix

* fix VL real_seq_lens_this_time

* fix

* fix mtp

* fix

* fix mtp

* fix xpu

* fix
2025-08-06 14:30:33 +08:00
SunLei
3dd8492601 [Bugfix] Fix uninitialized decoded_token and add corresponding unit test (#3201)
* Update test_base_chat.py (#3183)

* [Bugfix] Fix uninitialized decoded_token and add corresponding unit test.

---------

Co-authored-by: Divano <dddivano@outlook.com>
2025-08-05 10:55:22 +08:00
RAM
bd77a3a643 [Bug Fix] Fix bug of MLA Attention Backend (#3178)
* fix typo

* fix mla attention backend
2025-08-05 10:53:27 +08:00
YUNSHEN XIE
9561603ed9 Apply CI fix from Develop (#3151)
* fix ci approve

* Describe PR diff coverage using JSON file (#3114)

* Refactored ci pipeline

* update

* Describe PR diff coverage using JSON file

* remove pip cache setting from Approve

* fix

* update

* fix ci (#3141)

* fix
2025-08-04 16:30:56 +08:00
plusNew001
e26313a355 Update Dockerfile.xpu (#3147) 2025-08-04 16:25:33 +08:00
yinwei
4367c09a5f Fix out-of-memory issue during single-XPU deployment (#3131) 2025-08-04 16:02:43 +08:00
bukejiyu
8e789dcb67 fix load_pre_sharded_checkpoint (#3152) (#3169)
Co-authored-by: Jiang-Jia-Jun <163579578+Jiang-Jia-Jun@users.noreply.github.com>
2025-08-04 15:44:10 +08:00
ltd0924
5f6fc7f7b9 Update cache_messager.py (#3173) 2025-08-04 15:09:17 +08:00
RAM
d4059cabf0 fix typo (#3153) 2025-08-01 22:34:59 +08:00
chen
c8dd5976ae fix request_output sampling_params (#3154) 2025-08-01 22:34:33 +08:00
Jiang-Jia-Jun
4880c16be3 Update setup.py 2025-07-31 20:30:24 +08:00
SunLei
dade19d7a4 [Feature] General support for logprobs (#2974)
* [Feature] support logprobs in chat/completions and completions endpoints

* Temporarily comment out text_offset due to incorrect logic

* Clean up temporary debug prints

* [Feature] support logprobs in offline mode via SamplingParams

* fix: serialize Logprob as dict before zmq send to fix msgpack error

* refactor: remove redundant methods to simplify codebase

* Fix missing fields in CompletionOutput.to_dict affecting msgpack serialization

* refactor: centralize param validation in engine_client to reduce duplication

* revert: rollback changes in offline_demo.py

* revert: rollback changes in offline_demo.py

* [bugfix] fix parameter validation for logprobs

* [bugfix] fix parameter validation for logprobs

* [bugfix] fix parameter validation for logprobs

* [bugfix] fix parameter validation for logprobs

---------

Co-authored-by: Jiang-Jia-Jun <163579578+Jiang-Jia-Jun@users.noreply.github.com>
2025-07-31 20:25:56 +08:00
chenjian
fe17410f9c [BUG] Fix bug for pd in fd (#3034)
* Fix bug for pd in fd

* Fix bug for pd in fd

---------

Co-authored-by: Jiang-Jia-Jun <163579578+Jiang-Jia-Jun@users.noreply.github.com>
2025-07-31 20:17:27 +08:00
Zhang Yulong
1a543bca29 Fix test_EB_Lite_serving.py (#3119)
* Fix test_EB_Lite_serving.py

* fix test_EB_Lite_serving.py
2025-07-31 20:15:25 +08:00
Yuan Xiaolan
5f56d289a7 fix is_permuted (#3098)
Co-authored-by: Jiang-Jia-Jun <163579578+Jiang-Jia-Jun@users.noreply.github.com>
2025-07-31 19:58:05 +08:00
LiqinruiG
25005fee30 [Doc] add chat_template_kwargs and update params docs (#3103)
* add chat_template_kwargs and update params docs

* add chat_template_kwargs and update params docs

* update enable_thinking

* pre-commit

* update test case

---------

Co-authored-by: Jiang-Jia-Jun <163579578+Jiang-Jia-Jun@users.noreply.github.com>
2025-07-31 19:44:06 +08:00
kevin
22cab724e8 [Feature] block scheduler v1 support prefix caching (#3061)
* block scheduler v1 support prefix cache

* update code

* update code

* fix code bug

* add timeout time

---------

Co-authored-by: Jiang-Jia-Jun <163579578+Jiang-Jia-Jun@users.noreply.github.com>
2025-07-31 19:29:19 +08:00
chenjian
32307283f1 Fix bug for offline inference in scheduler v1 (#3117) 2025-07-31 17:54:24 +08:00
YUNSHEN XIE
583eae2fd1 fix ci (#3106)
* fix ci

* disable test_non_streaming_chat_with_min_tokens
2025-07-31 17:25:08 +08:00
JYChen
1ef38b1563 [doc] best practice for eb45 text models (#3002)
* [doc] best practice for eb45 text models

* fix docs
2025-07-31 17:21:55 +08:00
Jiang-Jia-Jun
4498058722 Update README.md 2025-07-31 15:33:12 +08:00
Jiang-Jia-Jun
66304cf921 Update sampling.md 2025-07-31 15:02:57 +08:00
yinwei
5b9aec1f10 xpu release 2.0.3 (#3105) 2025-07-31 14:26:07 +08:00
YUNSHEN XIE
66c3835a46 add approve ci (#3093)
* add approve ci

* fix

* fix
2025-07-31 10:10:10 +08:00
RAM
d850660872 [Executor] Refactor GetBlockShapeAndSplitKVBlock Kernel (#2989)
* reset decoder_block_shape_q buffer

* refactor GetBlockShapeAndSplitKVBlock Kernel and cudagraph padding batch

* update decode_max_tile_size

* fix pre-commit

* update block_multihead_attn_backend

* update flash attn backend

* update MLA Attention

* update XPU Attention

* update gcu,iluvatar model runner

* Update MTP

* fix MTP bug
2025-07-31 00:09:31 +08:00
Jiang-Jia-Jun
998968f1e8 [Doc] Update parameters of serving 2025-07-30 22:35:01 +08:00
chenjian
fe0e3f508b [BUG FIX] Fix bug when preempted request rescheduled (#3080)
* Fix bug when preempted request rescheduled

* Fix bug when preempted request rescheduled

* Fix bug when preempted request rescheduled
2025-07-30 22:25:47 +08:00
Jiang-Jia-Jun
0616c208d2 [Feature] Support include_stop_str_in_output in completion api (#3096)
* [Feature] Support include_stop_str_in_output in completion api

* Fix ci test

---------

Co-authored-by: Jiang-Jia-Jun <jiangjiajun@baidu.com>
2025-07-30 22:18:48 +08:00
YuanRisheng
7dfdd157ac [BugFix]Fix ep size (#3092)
* fix ep

* fix num_layer
2025-07-30 21:03:12 +08:00
ltd0924
d17886de19 [Feature] support ep in mixed mode (#3001)
* [LLM] support ep

* Update worker_process.py

* Update expert_service.py

* Update worker_process.py

* format files
2025-07-30 20:43:39 +08:00
JYChen
bd29b2aaca add stop_seqs doc (#3090) 2025-07-30 20:36:18 +08:00
Jiang-Jia-Jun
6ead7a3a49 Update setup.py 2025-07-30 20:21:41 +08:00
YUNSHEN XIE
e4ba9a0dde debug use (#3095) 2025-07-30 20:18:36 +08:00
Zhida Hu
3f8a41e68c [*] fix the memory leak when modify qp to rts failed (#3051)
Co-authored-by: Jiang-Jia-Jun <163579578+Jiang-Jia-Jun@users.noreply.github.com>
2025-07-30 19:49:07 +08:00
李泳桦
b242150f94 [feat] extra parameters are all passed directly via http payload now, or in extra_body if using openai client (#3058)
* [feat] extra parameters are all passed directly via http payload now, or in extra_body if using openai client

* [fix] delete ci test case for enable_thinking

* [fix] add reasoning_parser when server starts

* [fix] fix ci consistency test error with reasoning parser

* [doc] update docs related to metadata

* [fix] cancel enable_thinking default value
2025-07-30 19:25:20 +08:00
bukejiyu
db698bda01 qwen loader (#3057) 2025-07-30 19:09:38 +08:00
AIbin
28fff1b035 Revert "Add unittest for moe_ffn_wint2. (#3037)" (#3085)
This reverts commit 327e1943fa.
2025-07-30 19:04:07 +08:00
YuanRisheng
acc5c0aa85 add ci for custom op approve (#3079) 2025-07-30 16:50:20 +08:00
zhink
d89b6dd43f adapter qwen3 moe attr for init (#3066)
adapter qwen3 moe attr for init
2025-07-30 16:49:28 +08:00
bukejiyu
8e203666d9 w4a8 offline (#3074)
* w4a8 offline

* update

* update

* update
2025-07-30 16:33:30 +08:00
ming1753
5acde4eb43 [Feature] Multimodal Scheduler V1 (#3019)
* [Feature] Support multimodal scheduler v1

* remove debug log

* fix bug

* fix format

* modify code

* fix bug

* fix bug

* fix bug

* modify code
2025-07-30 16:05:55 +08:00
Jiang-Jia-Jun
ffa0f4d99b [Fix] Fix version function (#3076)
* [Fix] Fix version function

* Fix commit

* Fix commit

* fix code sync

* Update coverage_run.sh

---------

Co-authored-by: Jiang-Jia-Jun <jiangjiajun@baidu.com>
2025-07-30 16:05:24 +08:00
ltd0924
ecf2fd5b9a [BugFix] vl encoder tokens dtype problem (#3069) 2025-07-30 15:20:53 +08:00
YuanRisheng
eeadbf332a delete unused unittest (#3065) 2025-07-30 15:11:58 +08:00
Yiqun Liu
327e1943fa Add unittest for moe_ffn_wint2. (#3037)
Change-Id: Ifd452527eaf87ea96c3fa4fa9aeb17729b33c2de
2025-07-30 15:03:09 +08:00
Yuan Xiaolan
35935da9e5 support W4A8 EPLB (#3075) 2025-07-30 14:34:12 +08:00
Yzc216
159767717d [Feature] multi source download (#3072)
* multi-source download

* multi-source download

* huggingface download revision

* requirement

* style

* add revision arg

* test

* pre-commit

* Change default download

* change requirements.txt

* modify English Documentation

* documentation

* modify model download path
2025-07-30 14:10:13 +08:00
Zero Rains
4dc130c5a9 [Doc] add repetition early stopping doc (#3078)
* add repetition early stop doc

* add the early_stop.md
2025-07-29 22:01:57 -07:00
YuanRisheng
99a70fc722 unify parallel config (#3070) 2025-07-30 11:41:23 +08:00
lddfym
5ca684c762 update doc: load_balance.md (#3008)
* update doc of load_balance

* update doc: load_balance.md
2025-07-30 10:27:56 +08:00
Sunny-bot1
74aa31d15b [Feature] support bad_words (#3055)
* support bad_words

* support online infer bad_words

* update

* add CI test

* update

* update

* update

---------

Co-authored-by: Yuanle Liu <yuanlehome@163.com>
2025-07-30 09:31:29 +08:00
Sunny-bot1
9c962343f2 [Docs] add sampling docs (#2973)
* add sampling docs

* add minp sampling docs

* update sample docs

* update

* update

* add bad words desc

* update
2025-07-30 02:24:16 +08:00
zhuzixuan
ad7bb52a28 Fix the error raised when max_tokens=1 is passed (#3068)
* Fix the error raised when max_tokens=1 is passed

* Fix the error raised when max_tokens=1 is passed

* Fix the error raised when max_tokens=1 is passed

* Fix the error raised when max_tokens=1 is passed

* Fix the error raised when max_tokens=1 is passed

* Fix the error raised when max_tokens=1 is passed
2025-07-29 23:49:28 +08:00
Ryan
73cfe1fd37 [SOT] Extend SOT warmup support to new hardware (#3032)
* add new hardware

* add_sot_warmup4new_hardware

* fix conflict

* rm Optional
2025-07-29 22:45:20 +08:00
Zero Rains
b2f9a42d87 [Feature] Support repetition early stop (#3024)
* support repetition early stop and support user to set the parameter

* remove log

* fix codestyle

* add the early_stop_config to rollout_config

* update config and EarlyStopper class

* fix the bug for triton

* modify the stop method

* update description

* modify the usage for stop_flags

---------

Co-authored-by: Yuanle Liu <yuanlehome@163.com>
2025-07-29 22:42:54 +08:00
Yuan Xiaolan
3214fb5393 support model loading for w4a8 offline quant (#3064)
Support loading offline-quantized weights for W4A8 EP
2025-07-29 21:54:37 +08:00
Longzhi Wang
be0a0f2bb2 fix argument error in ep when pd (#3060) 2025-07-29 17:17:24 +08:00
YuanRisheng
502ee92a0a Unify server-side and model-side Config (Part3) (#3047)
* merge model config

* fix arch

* fix rl
2025-07-29 17:07:44 +08:00
Longzhi Wang
907d561523 fix ep when paddle version mismatch (#3056) 2025-07-29 15:06:49 +08:00
JYChen
dafe02a7b9 [stop sequence] support stop sequence (#3025)
* stop seqs in multi-ends

* unittest for gpu stop op

* kernel tid==0
2025-07-29 14:17:37 +08:00
YuanRisheng
1a815b7a2a Fix Speculative Config bug (#3049)
* fix speculative bug

* fix rl
2025-07-29 10:50:48 +08:00
yinwei
f2a528f9ae [XPU] Support kvblock centralized management (#3017) 2025-07-29 10:40:55 +08:00
Jiang-Jia-Jun
286802a070 Update ernie-4.5.md 2025-07-29 10:10:09 +08:00
Yuan Xiaolan
7d87aaace8 optimize w4a8 decoding (#3050) 2025-07-28 22:20:13 +08:00
lizhenyun01
e80ea8a71b remove Synchronize in hadamard 2025-07-28 19:22:46 +08:00
Yuan Xiaolan
b1d787a272 [fix] w4a8 model loading and hadamard config (#3013) 2025-07-28 18:17:59 +08:00
YUNSHEN XIE
c8bf8b3913 add logprob ci test (#3022)
* add logprob ci test
2025-07-28 17:30:58 +08:00
K11OntheBoat
83048bbe55 [Feature] Deepseekv3 supports cudagraph (#3041)
Co-authored-by: K11OntheBoat <“ruianmaidanglao@163.com”>
2025-07-28 17:12:54 +08:00
AIbin
ec52d39e68 【Inference Optimize】Update wint2 weight n-dim reorder (#3042) 2025-07-28 16:31:56 +08:00
YuanRisheng
bddf403576 Unify server-side and model-side Config (Part2) (#3035)
* merge speculative and graph opt config

* add attr
2025-07-28 15:31:48 +08:00
yinwei
776fb03250 add error info (#3040) 2025-07-28 15:10:28 +08:00
YUNSHEN XIE
60311956e4 fix(ci): correct diff coverage data download URL (#3036) 2025-07-28 14:44:02 +08:00
lizhenyun01
238766e403 fix c4 prompt_cache 2025-07-28 14:31:37 +08:00
chen
01485cd28b MTP rejection_topp add topk input (#3031) 2025-07-28 13:58:45 +08:00
begin2023
dd877f38b1 [Perf] Remove unnecessary operations in non-cuda_graph (#3010)
* [Perf] Remove unnecessary operations in non-cuda_graph

* fix code logic

* use suggestion comment

* reduce function call

* reduce function call

* reduce function call

* reduce function call
2025-07-27 20:38:29 -07:00
Longzhi Wang
247010d298 fix argument error (#3030) 2025-07-28 11:03:29 +08:00
YuanRisheng
6ccc10ad47 Unify server-side and model-side Config (Part1) (#3018)
* move cache config

* fix mtp
2025-07-28 10:51:52 +08:00
Yiqun Liu
8f426c1690 Optimize the performance of moe_expert_ffn_wint2 (#2990)
* Change wint2 to ColumnMajor.

Change-Id: I6b44d02946a685f8fe24d9f2c7be258b51e16da2

* Unify default_wint2x_mma.

Change-Id: I9e77b0e8e6cecab01fedc0b24b536ee0a1a89ff7

* Change wint2 to ColumnMajorTileInterleave.

Change-Id: I593cbe36f991c0c5044989d65f0014087587c624

* Enable async copy for B.

Change-Id: Ia3ac37ad162a8cf3ccce4f268e81bd06c8ac3c46

* Add wint2x Dequantizer

* Remove TileDequanterB related codes.

Change-Id: Id8e65703b72a8984d367f584ff41b7726017fbb8

* Implement FastInterleavedAndBiasedNumericArrayConverter for wint2.

Change-Id: I438f2b18ab964a04ae1cdb09d9e7d9f7b95eafca

* Implement Wint2ParamsAccessor to load extra quant params from global memory.

Change-Id: Ic3750cd9b767df8893501820880c3342a4b47233

* Implement FastInterleavedAndBiasedNumericArrayConverter for wint2.

Change-Id: I438f2b18ab964a04ae1cdb09d9e7d9f7b95eafca

* Use async copy for local_scale.

Change-Id: Ib882ba41c3d2354bda4d25b40e2408ad3b2f7893

* Check and correct the load and dequantize of weights.

Change-Id: Ie8dca505b39987144964fe6407d465b3b5953790

* Change for performance tuning.

Change-Id: I1da026fb1d1533a9d70350c7ba23c27e896cfc29

* Optimize the global memory access size of local_scale reading.

Change-Id: I4cbe3a2ef5951723d415c2d3252ce912394beaf5

* Specialize mma_tensor_op for wint2 to enable fine-grained pipeline.

Change-Id: Icbb4d48f90a41136f42d6ffff42d68de32f408da

* Minor fix.

Change-Id: I14d4ac9d267ee05442a3b47f00c26bee13d79e6f

* optimizing dequant performance with LOP3

* optimizing dequant performance with LOP3

* Avoid redundant dequantization of local_scale and use bf16 as computing type.

Change-Id: I63239ebc8f8e4a92d6281af59840ba50600b4334

* Add Multiplier and remove some logs.

Change-Id: Ifa199d81e6aeb472d2247c63f85ef30213684bcd

* optimizing dequant performance with LOP3

* Use __byte_perm to implement int8 to float32 conversion for performance improvement.

* Use lop3 to optimize the dequantize of local_scale.

Change-Id: I6189759970cb5b8dcbef769724784b8a7533b63c

* Minor fix and remove some logs.

Change-Id: I6279ba9926d5041093b1c6aea200acf2e4c49d46

* Fix stages for test.

Change-Id: I6f7b7cac612ef2c678e9d49f5ffa60eb53d3ae29

* Fix stages for test and add clock64 to profile.

Change-Id: Iffaf7324beaa910ce9ee56f47ae289de98f1a267

* Use __byte_perm to replace shift-and-or operations for faster integer merging.

* Split the uint2b convert.

Change-Id: I78da672ce8968e21f685285140ba546a161521b4

* Optimize convert of unscale.

Change-Id: I6795da1cdf5e8ab38ddaa9836240921b5312913a

* Minor optimization.

Change-Id: I1800aec34c3f4621abb02658208108f54da44d88

* Optimize mma pipeline and refine codes.

Change-Id: Id3075cf7b88f2813a11ccd1d3b49c62c978f36b8

* Add missing support.

Change-Id: Id65b7bc2c25fbb1a5b232c6bc9fb8c9093f691a8

* Accelerate FP16 dequantization performance

* Support tile shape as Xx64x64.

Change-Id: Ib8fd37e1ba1d06f7d11f2956e7f1367b0a92bcac

* Remove debugging codes and minor optimization.

Change-Id: I6b79bd56a6e8dd823efc169967ecd3cc9a43baf4

* Fix offset bug.

Change-Id: Id7aeb91e99d6f51836f2aff22187b4f79607395e

* Fix typo.

Change-Id: I19dde93fc1c1f7e19605905c90dc46298e203952

* Restore some codes and remove some debugging logs.

Change-Id: I8d44daf82ad1c6f8174134d195e7b3fe9a3afdfb

---------

Co-authored-by: baoqiwen <baoqiwen@baidu.com>
2025-07-28 10:32:43 +08:00
YUNSHEN XIE
fb410b5f4c Add unit test run and coverage report generation (#3011)
* Add unit test run and coverage report generation

* fix

* fix: upload coverage report failure

* fix

* update

* fix

* fix

* update
2025-07-27 22:48:34 +08:00
YUNSHEN XIE
1d29dd80f7 modified dockerfile (#3026)
2025-07-25 21:10:23 +08:00
李泳桦
69996a40da [feat] add disable_chat_template in chat api as a substitute for previous raw_request (#3020)
* [feat] add disable_chat_template in chat api as a substitute for previous raw_request

* [fix] pre-commit code check
2025-07-25 20:57:32 +08:00
Longzhi Wang
0700c90caa [Feat] support mixed ep (#2969)
* Support mixed ep

* fix comment

* fix comment

* update mixep

* fix conflict

* fix typo

* update

* fix typo

* fix code style

* fix conflict
2025-07-25 15:29:30 +08:00
chen
332154f504 [feature] Support FA2 (#3009) 2025-07-25 14:09:00 +08:00
YuBaoku
4b02b96467 [CI] fix codestyle_check (#3015) 2025-07-25 14:02:34 +08:00
EnflameGCU
8c167e130c [GCU] Update post_process (#3012) 2025-07-25 11:03:03 +08:00
EnflameGCU
7634ffb709 [GCU] Add CI (#3006) 2025-07-25 10:59:29 +08:00
Jiang-Jia-Jun
6ce3a8a497 Update index.md 2025-07-25 10:32:47 +08:00
195 changed files with 11743 additions and 2756 deletions


@@ -2,7 +2,9 @@ name: Codestyle-Check
on:
pull_request:
branches: ["develop"]
branches:
- develop
- 'release/*'
jobs:
pre-commit:
@@ -11,7 +13,7 @@ jobs:
runs-on: ubuntu-latest
env:
PR_ID: ${{ github.event.pull_request.number }}
BRANCH: develop
BRANCH: ${{ github.event.pull_request.base.ref }}
steps:
- name: Cleanup


@@ -44,7 +44,7 @@ on:
value: ${{ jobs.fd-build.outputs.wheel_path }}
jobs:
fd-build:
runs-on: [self-hosted, GPU-h1z1-4Cards]
runs-on: [self-hosted, GPU-Build]
outputs:
wheel_path: ${{ steps.set_output.outputs.wheel_path }}
steps:
@@ -88,10 +88,10 @@ jobs:
run: |
set -x
runner_name="${{ runner.name }}"
CARD_ID=$(echo "${runner_name}" | cut -d'-' -f2)
CARD_ID=$(echo "${runner_name}" | awk -F'-' '{print $NF}')
gpu_id=$(echo "$CARD_ID" | fold -w1 | paste -sd,)
CACHE_DIR=${CACHE_DIR:-${{ github.workspace }}}
CACHE_DIR="${CACHE_DIR:-$(dirname "$(dirname "${{ github.workspace }}")")}"
echo "CACHE_DIR is set to ${CACHE_DIR}"
if [ ! -f "${CACHE_DIR}/gitconfig" ]; then
touch "${CACHE_DIR}/gitconfig"
@@ -103,6 +103,7 @@ jobs:
-v $(pwd):/workspace -w /workspace \
-v "${CACHE_DIR}/gitconfig:/etc/gitconfig:ro" \
-v "${CACHE_DIR}/.cache:/root/.cache" \
-v "${CACHE_DIR}/.ccache:/root/.ccache" \
-v "${CACHE_DIR}/ConfigDir:/root/.config" \
-e TZ="Asia/Shanghai" \
-e "COMPILE_ARCH=${compile_arch}" \
@@ -123,14 +124,12 @@ jobs:
echo "Date Only: $DATE_ONLY"
export FASTDEPLOY_VERSION="${FASTDEPLOY_VERSION}.dev${DATE_ONLY}"
fi
pip config set global.index-url http://pip.baidu.com/root/baidu/+simple/
pip config set install.trusted-host pip.baidu.com
pip config set global.extra-index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
python -m pip install --pre paddlepaddle-gpu -i https://www.paddlepaddle.org.cn/packages/nightly/cu126/
pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
python -m pip install --upgrade pip
python -m pip install -r requirements.txt
python -m pip install wheel
python -m pip install --pre paddlepaddle-gpu -i https://www.paddlepaddle.org.cn/packages/nightly/cu126/
# Compile RDMA
export ENABLE_FD_RDMA=1
bash build.sh 1 python false [${COMPILE_ARCH}]


@@ -0,0 +1,169 @@
name: Run FastDeploy LogProb Tests
description: "Run FastDeploy LogProb Tests"
on:
workflow_call:
inputs:
DOCKER_IMAGE:
description: "Build Images"
required: true
type: string
default: "ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/paddleqa:cuda126-py310"
PADDLETEST_ARCHIVE_URL:
description: "URL of the compressed FastDeploy code archive."
required: true
type: string
default: "https://xly-devops.bj.bcebos.com/PaddleTest/PaddleTest.tar.gz"
FASTDEPLOY_WHEEL_URL:
description: "URL of the FastDeploy Wheel."
required: true
type: string
CACHE_DIR:
description: "Cache Dir Use"
required: false
type: string
default: ""
MODEL_CACHE_DIR:
description: "Cache Dir Use"
required: false
type: string
default: ""
jobs:
run_tests_logprob:
runs-on: [self-hosted, GPU-h20-1Cards]
steps:
- name: Code Prepare
shell: bash
env:
docker_image: ${{ inputs.DOCKER_IMAGE }}
paddletest_archive_url: ${{ inputs.PADDLETEST_ARCHIVE_URL }}
run: |
# Clean the repository directory before starting
docker run --rm --net=host -v $(pwd):/workspace -w /workspace \
-e "REPO_NAME=${REPO_NAME}" \
-e "BASE_BRANCH=${BASE_BRANCH}" \
${docker_image} /bin/bash -c '
rm -rf /workspace/*
'
wget -q ${paddletest_archive_url}
tar -xf PaddleTest.tar.gz
rm -rf PaddleTest.tar.gz
cd PaddleTest
git config --global user.name "FastDeployCI"
git config --global user.email "fastdeploy_ci@example.com"
git log -n 3 --oneline
- name: logprob test
shell: bash
env:
docker_image: ${{ inputs.DOCKER_IMAGE }}
fastdeploy_wheel_url: ${{ inputs.FASTDEPLOY_WHEEL_URL }}
CACHE_DIR: ${{ inputs.CACHE_DIR }}
MODEL_CACHE_DIR: ${{ inputs.MODEL_CACHE_DIR }}
run: |
runner_name="${{ runner.name }}"
CARD_ID=$(echo "${runner_name}" | awk -F'-' '{print $NF}')
DEVICES=$(echo "$CARD_ID" | fold -w1 | paste -sd,)
DEVICE_PORT=$(echo "$DEVICES" | cut -d',' -f1)
FLASK_PORT=$((42068 + DEVICE_PORT * 100))
FD_API_PORT=$((42088 + DEVICE_PORT * 100))
FD_ENGINE_QUEUE_PORT=$((42058 + DEVICE_PORT * 100))
FD_METRICS_PORT=$((42078 + DEVICE_PORT * 100))
echo "Test ENV Parameter:"
echo "========================================================="
echo "FLASK_PORT=${FLASK_PORT}"
echo "FD_API_PORT=${FD_API_PORT}"
echo "FD_ENGINE_QUEUE_PORT=${FD_ENGINE_QUEUE_PORT}"
echo "FD_METRICS_PORT=${FD_METRICS_PORT}"
echo "DEVICES=${DEVICES}"
echo "========================================================="
CACHE_DIR="${CACHE_DIR:-$(dirname "$(dirname "${{ github.workspace }}")")}"
echo "CACHE_DIR is set to ${CACHE_DIR}"
if [ ! -f "${CACHE_DIR}/gitconfig" ]; then
touch "${CACHE_DIR}/gitconfig"
fi
if [ ! -d "${MODEL_CACHE_DIR}" ]; then
echo "Error: MODEL_CACHE_DIR '${MODEL_CACHE_DIR}' does not exist."
exit 1
fi
PORTS=($FLASK_PORT $FD_API_PORT $FD_ENGINE_QUEUE_PORT $FD_METRICS_PORT)
LOG_FILE="./port_cleanup_$(date +%Y%m%d_%H%M%S).log"
echo "==== LOG_FILE is ${LOG_FILE} ===="
echo "==== PORT CLEAN BEFORE TASK RUN ====" | tee -a $LOG_FILE
for port in "${PORTS[@]}"; do
PIDS=$(lsof -t -i :$port || true)
if [ -n "$PIDS" ]; then
echo "Port $port is occupied by PID(s): $PIDS" | tee -a $LOG_FILE
echo "$PIDS" | xargs -r kill -9
echo "Port $port cleared" | tee -a $LOG_FILE
else
echo "Port $port is free" | tee -a $LOG_FILE
fi
done
echo "==== PORT CLEAN COMPLETE ====" | tee -a $LOG_FILE
docker run --ipc=host --pid=host --net=host \
-v $(pwd):/workspace \
-w /workspace \
-e fastdeploy_wheel_url=${fastdeploy_wheel_url} \
-e "FD_API_PORT=${FD_API_PORT}" \
-e "FD_ENGINE_QUEUE_PORT=${FD_ENGINE_QUEUE_PORT}" \
-e "FD_METRICS_PORT=${FD_METRICS_PORT}" \
-e "FLASK_PORT=${FLASK_PORT}" \
-v "${MODEL_CACHE_DIR}:/MODELDATA" \
-v "${CACHE_DIR}/gitconfig:/etc/gitconfig:ro" \
-v "${CACHE_DIR}/.cache:/root/.cache" \
-v "${CACHE_DIR}/ConfigDir:/root/.config" \
-e TZ="Asia/Shanghai" \
--gpus '"device='"${DEVICES}"'"' ${docker_image} /bin/bash -xc '
python -m pip install --pre paddlepaddle-gpu -i https://www.paddlepaddle.org.cn/packages/nightly/cu126/
pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
python -m pip install ${fastdeploy_wheel_url}
wget https://paddle-qa.bj.bcebos.com/zhengtianyu/tools/llm-deploy-linux-amd64
chmod +x ./llm-deploy-linux-amd64
./llm-deploy-linux-amd64 -python python3.10 \
-model_name ERNIE-4.5-0.3B-Paddle \
-model_path /MODELDATA \
--skip install
cd PaddleTest/framework/ServeTest
python3.10 deploy.py > dd.log 2>&1 &
sleep 3
curl -X POST http://0.0.0.0:${FLASK_PORT}/start \
-H "Content-Type: application/json" \
-d "{\"--model\": \"/MODELDATA/ERNIE-4.5-0.3B-Paddle\"}"
curl -X POST http://localhost:${FLASK_PORT}/wait_for_infer?timeout=90
set +e
rm -rf ./baseline_output
cp -r baseline/ERNIE-4.5-0.3B-Paddle ./baseline_output
LOGPROB_EXIT_CODE=0
python3.10 lanucher.py --request_template TOKEN_LOGPROB --url http://localhost:${FD_API_PORT}/v1/chat/completions --case ./cases/demo.yaml --concurrency 1 --name demo --exe logprob || LOGPROB_EXIT_CODE=$?
echo "LOGPROB_EXIT_CODE=${LOGPROB_EXIT_CODE}" > /workspace/exit_code.env
curl -X POST http://localhost:${FLASK_PORT}/stop
sleep 10s
cat *result.log
exit 0
'
if [ $? -ne 0 ];then
exit 1
fi
if [ -f exit_code.env ]; then
cat exit_code.env >> $GITHUB_ENV
fi
- name: logprob test result
if: ${{ env.LOGPROB_EXIT_CODE != 0 }}
shell: bash
run: |
echo "logprob test failed with exit code ${{ env.LOGPROB_EXIT_CODE }}"
exit 8
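
As a reading aid for the port assignment repeated in these workflows, the sketch below works through the arithmetic for a hypothetical runner name; the name, and therefore the concrete port numbers, are assumptions for illustration only.

#!/usr/bin/env bash
# Minimal sketch of the port derivation used in the CI workflows above,
# assuming a hypothetical runner named "GPU-h20-1Cards-3".
runner_name="GPU-h20-1Cards-3"
CARD_ID=$(echo "${runner_name}" | awk -F'-' '{print $NF}')   # last dash-separated field -> "3"
DEVICES=$(echo "$CARD_ID" | fold -w1 | paste -sd,)           # one GPU index per character -> "3"
DEVICE_PORT=$(echo "$DEVICES" | cut -d',' -f1)               # first device index -> 3
FLASK_PORT=$((42068 + DEVICE_PORT * 100))                    # -> 42368
FD_API_PORT=$((42088 + DEVICE_PORT * 100))                   # -> 42388
FD_ENGINE_QUEUE_PORT=$((42058 + DEVICE_PORT * 100))          # -> 42358
FD_METRICS_PORT=$((42078 + DEVICE_PORT * 100))               # -> 42378
echo "FLASK_PORT=${FLASK_PORT} FD_API_PORT=${FD_API_PORT} FD_ENGINE_QUEUE_PORT=${FD_ENGINE_QUEUE_PORT} FD_METRICS_PORT=${FD_METRICS_PORT}"

Spacing the ports 100 apart per device keeps concurrent jobs on different cards of the same host from colliding.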

.github/workflows/_pre_ce_test.yml

@@ -0,0 +1,138 @@
name: Pre-CE-Test
on:
workflow_call:
inputs:
DOCKER_IMAGE:
description: "Build Images"
required: true
type: string
default: "ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/paddle:fastdeploy-ciuse-cuda126"
FASTDEPLOY_ARCHIVE_URL:
description: "URL of the compressed FastDeploy code archive."
required: true
type: string
FASTDEPLOY_WHEEL_URL:
description: "URL of the FastDeploy Wheel."
required: true
type: string
CACHE_DIR:
description: "Cache Dir Use"
required: false
type: string
default: ""
MODEL_CACHE_DIR:
description: "Cache Dir Use"
required: false
type: string
default: ""
concurrency:
group: ${{ github.event.pull_request.number }}
cancel-in-progress: true
jobs:
run_ce_cases:
runs-on: [self-hosted, PRE_CE_RUN_2Card]
steps:
- name: Print current runner name
run: |
echo "Current runner name: ${{ runner.name }}"
- name: Code Prepare
shell: bash
env:
docker_image: ${{ inputs.DOCKER_IMAGE }}
fd_archive_url: ${{ inputs.FASTDEPLOY_ARCHIVE_URL }}
run: |
set -x
REPO="https://github.com/${{ github.repository }}.git"
FULL_REPO="${{ github.repository }}"
REPO_NAME="${FULL_REPO##*/}"
BASE_BRANCH="${{ github.base_ref }}"
# Clean the repository directory before starting
docker run --rm --net=host -v $(pwd):/workspace -w /workspace \
-e "REPO_NAME=${REPO_NAME}" \
${docker_image} /bin/bash -c '
if [ -d ${REPO_NAME} ]; then
echo "Directory ${REPO_NAME} exists, removing it..."
rm -rf ${REPO_NAME}*
fi
'
wget -q ${fd_archive_url}
tar -xf FastDeploy.tar.gz
rm -rf FastDeploy.tar.gz
cd FastDeploy
git config --global user.name "FastDeployCI"
git config --global user.email "fastdeploy_ci@example.com"
git log -n 3 --oneline
- name: Run CI unittest
env:
docker_image: ${{ inputs.DOCKER_IMAGE }}
fd_wheel_url: ${{ inputs.FASTDEPLOY_WHEEL_URL }}
CACHE_DIR: ${{ inputs.CACHE_DIR }}
MODEL_CACHE_DIR: ${{ inputs.MODEL_CACHE_DIR }}
run: |
runner_name="${{ runner.name }}"
CARD_ID=$(echo "${runner_name}" | awk -F'-' '{print $NF}')
DEVICES=$(echo "$CARD_ID" | fold -w1 | paste -sd,)
DEVICE_PORT=$(echo "$DEVICES" | cut -d',' -f1)
FLASK_PORT=$((42068 + DEVICE_PORT * 100))
FD_API_PORT=$((42088 + DEVICE_PORT * 100))
FD_ENGINE_QUEUE_PORT=$((42058 + DEVICE_PORT * 100))
FD_METRICS_PORT=$((42078 + DEVICE_PORT * 100))
echo "Test ENV Parameter:"
echo "========================================================="
echo "FLASK_PORT=${FLASK_PORT}"
echo "FD_API_PORT=${FD_API_PORT}"
echo "FD_ENGINE_QUEUE_PORT=${FD_ENGINE_QUEUE_PORT}"
echo "FD_METRICS_PORT=${FD_METRICS_PORT}"
echo "DEVICES=${DEVICES}"
echo "========================================================="
CACHE_DIR="${CACHE_DIR:-$(dirname "$(dirname "${{ github.workspace }}")")}"
echo "CACHE_DIR is set to ${CACHE_DIR}"
if [ ! -f "${CACHE_DIR}/gitconfig" ]; then
touch "${CACHE_DIR}/gitconfig"
fi
PORTS=($FLASK_PORT $FD_API_PORT $FD_ENGINE_QUEUE_PORT $FD_METRICS_PORT)
LOG_FILE="./port_cleanup_$(date +%Y%m%d_%H%M%S).log"
echo "==== LOG_FILE is ${LOG_FILE} ===="
echo "==== PORT CLEAN BEFORE TASK RUN ====" | tee -a $LOG_FILE
for port in "${PORTS[@]}"; do
PIDS=$(lsof -t -i :$port || true)
if [ -n "$PIDS" ]; then
echo "Port $port is occupied by PID(s): $PIDS" | tee -a $LOG_FILE
echo "$PIDS" | xargs -r kill -9
echo "Port $port cleared" | tee -a $LOG_FILE
else
echo "Port $port is free" | tee -a $LOG_FILE
fi
done
echo "==== PORT CLEAN COMPLETE ====" | tee -a $LOG_FILE
docker run --rm --net=host -v $(pwd):/workspace -w /workspace \
-v "${CACHE_DIR}/gitconfig:/etc/gitconfig:ro" \
-v "${CACHE_DIR}/.cache:/root/.cache" \
-v "${CACHE_DIR}/ConfigDir:/root/.config" \
-v "${MODEL_CACHE_DIR}:/ModelData:ro" \
-e "MODEL_PATH=/ModelData" \
-e "FD_API_PORT=${FD_API_PORT}" \
-e "FD_ENGINE_QUEUE_PORT=${FD_ENGINE_QUEUE_PORT}" \
-e "FD_METRICS_PORT=${FD_METRICS_PORT}" \
-e "FLASK_PORT=${FLASK_PORT}" \
-e "fd_wheel_url=${fd_wheel_url}" \
--gpus "\"device=${DEVICES}\"" ${docker_image} /bin/bash -c '
git config --global --add safe.directory /workspace/FastDeploy
cd FastDeploy
python -m pip install --pre paddlepaddle-gpu -i https://www.paddlepaddle.org.cn/packages/nightly/cu126/
python -m pip install ${fd_wheel_url}
bash scripts/run_pre_ce.sh
'


@@ -0,0 +1,274 @@
name: Run FastDeploy Unit Tests and Coverage
description: "Run FastDeploy Unit Tests and Coverage"
on:
workflow_call:
inputs:
DOCKER_IMAGE:
description: "Build Images"
required: true
type: string
default: "ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/paddleqa:cuda126-py310"
FASTDEPLOY_ARCHIVE_URL:
description: "URL of the compressed FastDeploy code archive."
required: true
type: string
FASTDEPLOY_WHEEL_URL:
description: "URL of the FastDeploy Wheel."
required: true
type: string
CACHE_DIR:
description: "Cache Dir Use"
required: false
type: string
default: ""
MODEL_CACHE_DIR:
description: "Cache Dir Use"
required: false
type: string
default: ""
jobs:
run_tests_with_coverage:
runs-on: [self-hosted, GPU-h1z1-2Cards]
outputs:
diff_cov_file_url: ${{ steps.cov_upload.outputs.diff_cov_file_url }}
unittest_failed_url: ${{ steps.cov_upload.outputs.unittest_failed_url }}
diff_cov_result_json_url: ${{ steps.cov_upload.outputs.diff_cov_result_json_url }}
steps:
- name: Code Prepare
shell: bash
env:
docker_image: ${{ inputs.DOCKER_IMAGE }}
fd_archive_url: ${{ inputs.FASTDEPLOY_ARCHIVE_URL }}
run: |
set -x
REPO="https://github.com/${{ github.repository }}.git"
FULL_REPO="${{ github.repository }}"
REPO_NAME="${FULL_REPO##*/}"
BASE_BRANCH="${{ github.base_ref }}"
# Clean the repository directory before starting
docker run --rm --net=host -v $(pwd):/workspace -w /workspace \
-e "REPO_NAME=${REPO_NAME}" \
${docker_image} /bin/bash -c '
if [ -d ${REPO_NAME} ]; then
echo "Directory ${REPO_NAME} exists, removing it..."
rm -rf ${REPO_NAME}*
fi
'
wget -q ${fd_archive_url}
tar -xf FastDeploy.tar.gz
rm -rf FastDeploy.tar.gz
cd FastDeploy
git config --global user.name "FastDeployCI"
git config --global user.email "fastdeploy_ci@example.com"
git log -n 3 --oneline
- name: Run FastDeploy Unit Tests and Coverage
shell: bash
env:
docker_image: ${{ inputs.DOCKER_IMAGE }}
fd_wheel_url: ${{ inputs.FASTDEPLOY_WHEEL_URL }}
CACHE_DIR: ${{ inputs.CACHE_DIR }}
BASE_REF: ${{ github.event.pull_request.base.ref }}
MODEL_CACHE_DIR: ${{ inputs.MODEL_CACHE_DIR }}
run: |
set -x
runner_name="${{ runner.name }}"
CARD_ID=$(echo "${runner_name}" | awk -F'-' '{print $NF}')
DEVICES=$(echo "$CARD_ID" | fold -w1 | paste -sd,)
DEVICE_PORT=$(echo "$DEVICES" | cut -d',' -f1)
FLASK_PORT=$((42068 + DEVICE_PORT * 100))
FD_API_PORT=$((42088 + DEVICE_PORT * 100))
FD_ENGINE_QUEUE_PORT=$((42058 + DEVICE_PORT * 100))
FD_METRICS_PORT=$((42078 + DEVICE_PORT * 100))
echo "Test ENV Parameter:"
echo "========================================================="
echo "FLASK_PORT=${FLASK_PORT}"
echo "FD_API_PORT=${FD_API_PORT}"
echo "FD_ENGINE_QUEUE_PORT=${FD_ENGINE_QUEUE_PORT}"
echo "FD_METRICS_PORT=${FD_METRICS_PORT}"
echo "DEVICES=${DEVICES}"
echo "========================================================="
CACHE_DIR="${CACHE_DIR:-$(dirname "$(dirname "${{ github.workspace }}")")}"
echo "CACHE_DIR is set to ${CACHE_DIR}"
if [ ! -f "${CACHE_DIR}/gitconfig" ]; then
touch "${CACHE_DIR}/gitconfig"
fi
PORTS=($FLASK_PORT $FD_API_PORT $FD_ENGINE_QUEUE_PORT $FD_METRICS_PORT)
LOG_FILE="./port_cleanup_$(date +%Y%m%d_%H%M%S).log"
echo "==== LOG_FILE is ${LOG_FILE} ===="
echo "==== PORT CLEAN BEFORE TASK RUN ====" | tee -a $LOG_FILE
for port in "${PORTS[@]}"; do
PIDS=$(lsof -t -i :$port || true)
if [ -n "$PIDS" ]; then
echo "Port $port is occupied by PID(s): $PIDS" | tee -a $LOG_FILE
echo "$PIDS" | xargs -r kill -9
echo "Port $port cleared" | tee -a $LOG_FILE
else
echo "Port $port is free" | tee -a $LOG_FILE
fi
done
echo "==== PORT CLEAN COMPLETE ====" | tee -a $LOG_FILE
docker run --rm --net=host \
--cap-add=SYS_PTRACE --shm-size=64G \
-v $(pwd):/workspace -w /workspace \
-v "${CACHE_DIR}/gitconfig:/etc/gitconfig:ro" \
-v "${CACHE_DIR}/.cache:/root/.cache" \
-v "${CACHE_DIR}/ConfigDir:/root/.config" \
-v "${MODEL_CACHE_DIR}:/ModelData:ro" \
-e "MODEL_PATH=/ModelData" \
-e "FD_API_PORT=${FD_API_PORT}" \
-e "FD_ENGINE_QUEUE_PORT=${FD_ENGINE_QUEUE_PORT}" \
-e "FD_METRICS_PORT=${FD_METRICS_PORT}" \
-e "FLASK_PORT=${FLASK_PORT}" \
-e TZ="Asia/Shanghai" \
-e "fd_wheel_url=${fd_wheel_url}" \
-e "BASE_REF=${BASE_REF}" \
--gpus "\"device=${DEVICES}\"" ${docker_image} /bin/bash -c '
git config --global --add safe.directory /workspace/FastDeploy
cd FastDeploy
python -m pip install --pre paddlepaddle-gpu -i https://www.paddlepaddle.org.cn/packages/nightly/cu126/
pip config set global.extra-index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
python -m pip install coverage
python -m pip install diff-cover
python -m pip install ${fd_wheel_url}
if [ -d "test/plugins" ]; then
cd test/plugins
python setup.py install
cd ../..
else
echo "Warning: test/plugins directory not found, skipping setup.py install"
fi
export COVERAGE_FILE=/workspace/FastDeploy/coveragedata/.coverage
export COVERAGE_RCFILE=/workspace/FastDeploy/scripts/.coveragerc
TEST_EXIT_CODE=0
bash scripts/coverage_run.sh || TEST_EXIT_CODE=8
git diff origin/${BASE_REF}..HEAD --unified=0 > diff.txt
echo "TEST_EXIT_CODE=${TEST_EXIT_CODE}" >> exit_code.env
coverage combine coveragedata/
coverage xml -o python_coverage_all.xml
COVERAGE_EXIT_CODE=0
diff-cover python_coverage_all.xml --diff-file=diff.txt --fail-under=80 --json-report diff_coverage.json || COVERAGE_EXIT_CODE=9
echo "COVERAGE_EXIT_CODE=${COVERAGE_EXIT_CODE}" >> exit_code.env
python scripts/generate_diff_coverage_xml.py diff.txt python_coverage_all.xml
'
if [ -f FastDeploy/exit_code.env ]; then
cat FastDeploy/exit_code.env >> $GITHUB_ENV
fi
- name: Upload unit result and diff coverage to bos
id: cov_upload
shell: bash
run: |
cd FastDeploy
commit_id=${{ github.event.pull_request.head.sha }}
pr_num=${{ github.event.pull_request.number }}
target_path=paddle-github-action/PR/FastDeploy/${pr_num}/${commit_id}/SM${compile_arch//,/_}
wget -q --no-proxy --no-check-certificate https://paddle-qa.bj.bcebos.com/CodeSync/develop/PaddlePaddle/PaddleTest/tools/bos_tools.py
push_file=$(realpath bos_tools.py)
python -m pip install bce-python-sdk==0.9.29
diff_cov_file="diff_coverage.xml"
if [ -f ${diff_cov_file} ];then
python ${push_file} ${diff_cov_file} ${target_path}/CoverageData
target_path_stripped="${target_path#paddle-github-action/}"
DIFF_COV_FILE_URL=https://paddle-github-action.bj.bcebos.com/${target_path_stripped}/CoverageData/${diff_cov_file}
echo "diff_cov_file_url=${DIFF_COV_FILE_URL}" >> $GITHUB_OUTPUT
echo "diff_cov_file_url=${DIFF_COV_FILE_URL}" >> $GITHUB_ENV
fi
diff_cov_result_json="diff_coverage.json"
if [ -f ${diff_cov_result_json} ];then
python ${push_file} ${diff_cov_result_json} ${target_path}/CoverageData
target_path_stripped="${target_path#paddle-github-action/}"
DIFF_COV_JSON_URL=https://paddle-github-action.bj.bcebos.com/${target_path_stripped}/CoverageData/${diff_cov_result_json}
echo "diff_cov_result_json_url=${DIFF_COV_JSON_URL}" >> $GITHUB_OUTPUT
echo "diff_cov_result_json_url=${DIFF_COV_JSON_URL}" >> $GITHUB_ENV
fi
unittest_result="test/failed_tests.log"
if [ -s ${unittest_result} ];then
python ${push_file} ${unittest_result} ${target_path}/UnitTestResult
target_path_stripped="${target_path#paddle-github-action/}"
UNIT_TEST_RESULT_URL=https://paddle-github-action.bj.bcebos.com/${target_path_stripped}/UnitTestResult/${unittest_result}
echo "unittest_failed_url=${UNIT_TEST_RESULT_URL}" >> $GITHUB_OUTPUT
echo "unittest_failed_url=${UNIT_TEST_RESULT_URL}" >> $GITHUB_ENV
fi
- name: Check Unit Test Success
shell: bash
run: |
cd FastDeploy
if [ "$TEST_EXIT_CODE" -eq 8 ]; then
filename=$(basename "$unittest_failed_url")
if [ -z "${unittest_failed_url}" ]; then
echo "No diff unit failed file URL provided."
else
rm -rf "${filename}"
wget -O ${filename} ${unittest_failed_url} || echo "Download unittest file failed, but continuing..."
fi
echo "Unit tests failed (exit code 8)"
if [ -f "${filename}" ];then
echo "Failed test cases:"
cat "${filename}"
fi
exit "$TEST_EXIT_CODE"
fi
echo "All tests passed"
- name: Verify Code Coverage Threshold (80%)
shell: bash
run: |
cd FastDeploy
if [ "$COVERAGE_EXIT_CODE" -eq 9 ]; then
echo "Coverage generation failed (exit code 9)"
filename=$(basename "$diff_cov_result_json_url")
if [ -z "${diff_cov_result_json_url}" ]; then
echo "No diff cov result file URL provided."
else
rm -rf "${filename}"
wget -O ${filename} ${diff_cov_result_json_url} || echo "Download cov json file failed, but continuing..."
fi
if [ -f "${filename}" ];then
echo "Failed test cases:"
if command -v jq >/dev/null 2>&1; then
jq . "${filename}"
else
cat "${filename}"
fi
fi
exit "$COVERAGE_EXIT_CODE"
fi
echo "coverage passed"
exit 0
diff_coverage_report:
needs: run_tests_with_coverage
if: always()
runs-on: ubuntu-latest
steps:
- name: coverage diff file download
shell: bash
env:
diff_cov_file_url: ${{ needs.run_tests_with_coverage.outputs.diff_cov_file_url }}
run: |
if [ -z "${diff_cov_file_url}" ]; then
echo "No diff coverage file URL provided."
exit 0
fi
wget "${diff_cov_file_url}" -O ./diff_coverage.xml || echo "Download cov file failed, but continuing..."
- name: Upload diff coverage report
if: ${{ needs.run_tests_with_coverage.outputs.diff_cov_file_url != null && needs.run_tests_with_coverage.outputs.diff_cov_file_url != '' }}
uses: codecov/codecov-action@v5
with:
files: ./diff_coverage.xml
name: python diff coverage
verbose: true

.github/workflows/approve.yml

@@ -0,0 +1,39 @@
name: Approval
on:
pull_request:
branches:
- develop
- 'release/*'
jobs:
Approval:
name: Approval
if: ${{ github.repository_owner == 'PaddlePaddle' }}
runs-on: ubuntu-latest
env:
PR_ID: ${{ github.event.pull_request.number }}
BRANCH: ${{ github.event.pull_request.base.ref }}
steps:
- name: Checkout base repo
uses: actions/checkout@v4
with:
ref: ${{ github.event.pull_request.base.ref }}
fetch-depth: 1000
- name: Merge PR to test branch
run: |
git fetch origin pull/${PR_ID}/merge
git checkout -b test FETCH_HEAD
git log -n 3 --oneline
git remote add upstream https://github.com/PaddlePaddle/FastDeploy.git
git fetch upstream $BRANCH
- name: Setup python3.10
uses: actions/setup-python@v5
with:
python-version: '3.10'
- name: Run approval check script
run: |
bash scripts/check_approval.sh


@@ -1,4 +1,4 @@
name: CI
name: CI_GCU
on:
pull_request:
@@ -8,23 +8,20 @@ on:
workflow_dispatch:
concurrency:
group: ${{ github.event.pull_request.number }}
group: ${{ github.event.pull_request.number }}-gcu-ci
cancel-in-progress: true
jobs:
build:
runs-on: [self-hosted, GPU-L20-4Card]
CI_GCU:
runs-on: [self-hosted, GCU-S60-8Card]
steps:
- name: Print current runner name
run: |
echo "Current runner name: ${{ runner.name }}"
# Because the system version is lower than 2.23, the checkout cannot be used.
# - name: Checkout code
# uses: actions/checkout@v4
- name: Code Checkout
env:
docker_image: ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/paddle:fastdeploy-ciuse-cuda126
docker_image: ccr-2vdh3abv-pub.cnc.bj.baidubce.com/device/paddle-gcu:topsrider3.5.102-ubuntu20-x86_64-gcc84
run: |
REPO="https://github.com/${{ github.repository }}.git"
FULL_REPO="${{ github.repository }}"
@@ -55,35 +52,38 @@ jobs:
- name: Run CI unittest
env:
docker_image: ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/paddle:fastdeploy-ciuse-cuda126
docker_image: ccr-2vdh3abv-pub.cnc.bj.baidubce.com/device/paddle-gcu:topsrider3.5.102-ubuntu20-x86_64-gcc84
run: |
runner_name="${{ runner.name }}"
last_char="${runner_name: -1}"
if [ "${last_char}" = "1" ]; then
gpu_id=2
DEVICES="2,3"
if [[ "$last_char" =~ [0-3] ]]; then
gcu_id="$last_char"
else
gpu_id=0
DEVICES="0,1"
gcu_id="0"
fi
FD_API_PORT=$((9180 + gpu_id * 100))
FD_ENGINE_QUEUE_PORT=$((9150 + gpu_id * 100))
FD_METRICS_PORT=$((9170 + gpu_id * 100))
FD_API_PORT=$((9180 + gcu_id * 100))
FD_ENGINE_QUEUE_PORT=$((9150 + gcu_id * 100))
FD_METRICS_PORT=$((9170 + gcu_id * 100))
PARENT_DIR=$(dirname "$WORKSPACE")
echo "PARENT_DIR:$PARENT_DIR"
docker run --rm --net=host -v $(pwd):/workspace -w /workspace \
-v "/ssd4/GithubActions/gitconfig:/etc/gitconfig:ro" \
-v "/ssd4/GithubActions/ModelData:/ModelData:ro" \
-v "/ssd4/GithubActions/CacheDir:/root/.cache" \
-v "/ssd4/GithubActions/ConfigDir:/root/.config" \
-e "MODEL_PATH=/ModelData" \
echo "Install drivers..."
cd /work/deps
bash TopsRider_i3x_*_deb_amd64.run --driver --no-auto-load -y
cd -
docker run --rm --network=host --ipc=host -it --privileged \
-v $(pwd):/workspace -w /workspace \
-v "/home:/home" \
-v "/work:/work" \
-e "MODEL_PATH=/work/models" \
-e "http_proxy=$(git config --global --get http.proxy)" \
-e "https_proxy=$(git config --global --get https.proxy)" \
-e "FD_API_PORT=${FD_API_PORT}" \
-e "FD_ENGINE_QUEUE_PORT=${FD_ENGINE_QUEUE_PORT}" \
-e "FD_METRICS_PORT=${FD_METRICS_PORT}" \
--gpus '"device='"${DEVICES}"'"' ${docker_image} /bin/bash -c "
${docker_image} /bin/bash -c "
git config --global --add safe.directory /workspace/FastDeploy
cd FastDeploy
bash scripts/run_ci.sh
bash scripts/run_ci_gcu.sh
"


@@ -21,7 +21,7 @@ jobs:
with:
DOCKER_IMAGE: ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/paddleqa:cuda126-py310
FASTDEPLOY_ARCHIVE_URL: ${{ needs.clone.outputs.repo_archive_url }}
COMPILE_ARCH: "90"
COMPILE_ARCH: "89,90"
WITH_NIGHTLY_BUILD: "OFF"
FD_VERSION: "0.0.0"
@@ -33,3 +33,33 @@ jobs:
- name: Print wheel path
run: |
echo "The built wheel is located at: ${{ needs.build.outputs.wheel_path }}"
unittest_coverage:
name: Run FastDeploy Unit Tests and Coverage
needs: [clone,build]
uses: ./.github/workflows/_unit_test_coverage.yml
with:
DOCKER_IMAGE: ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/paddleqa:fastdeploy-ciuse-cuda126-dailyupdate
FASTDEPLOY_ARCHIVE_URL: ${{ needs.clone.outputs.repo_archive_url }}
FASTDEPLOY_WHEEL_URL: ${{ needs.build.outputs.wheel_path }}
MODEL_CACHE_DIR: "/ssd2/actions-runner/ModelData"
logprob_test:
name: Run FastDeploy LogProb Tests
needs: [build]
uses: ./.github/workflows/_logprob_test_linux.yml
with:
DOCKER_IMAGE: ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/paddleqa:fastdeploy-ciuse-cuda126-dailyupdate
PADDLETEST_ARCHIVE_URL: "https://xly-devops.bj.bcebos.com/PaddleTest/PaddleTest.tar.gz"
FASTDEPLOY_WHEEL_URL: ${{ needs.build.outputs.wheel_path }}
MODEL_CACHE_DIR: "/ssd2/actions-runner/ModelData"
pre_ce_test:
name: Extracted partial CE model tasks to run in CI.
needs: [clone,build]
uses: ./.github/workflows/_pre_ce_test.yml
with:
DOCKER_IMAGE: ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/paddleqa:fastdeploy-ciuse-cuda126-dailyupdate
FASTDEPLOY_ARCHIVE_URL: ${{ needs.clone.outputs.repo_archive_url }}
FASTDEPLOY_WHEEL_URL: ${{ needs.build.outputs.wheel_path }}
MODEL_CACHE_DIR: "/ssd2/actions-runner/ModelData"


@@ -361,8 +361,7 @@ async def benchmark(
if not test_output.success:
raise ValueError(
"Initial test run failed - Please make sure benchmark arguments "
f"are correctly specified. Error: {test_output.error}"
f"Initial test run failed - Please make sure that 1. benchmark arguments are correctly specified and 2. the http_proxy and https_proxy are turned off. Error: {test_output.error}"
)
else:
print("Initial test run completed. Starting main benchmark run...")


@@ -195,22 +195,25 @@ std::vector<paddle::Tensor> GetBlockShapeAndSplitKVBlock(
const paddle::Tensor &seq_lens_encoder,
const paddle::Tensor &seq_lens_decoder,
const paddle::Tensor &seq_lens_this_time,
const int encoder_block_shape_q, const int decoder_block_shape_q,
const int group_size, const int block_size,
const int decoder_step_token_num) {
paddle::Tensor &decoder_batch_ids, // Inplace
paddle::Tensor &decoder_tile_ids_per_batch, // Inplace
paddle::Tensor &decoder_num_blocks_x_cpu, // Inplace, Pinned Memory
paddle::Tensor &max_len_tensor_cpu, // Inplace, Pinned Memory
const int encoder_block_shape_q,
const int decoder_block_shape_q,
const int group_size,
const int block_size,
const int decoder_step_token_num)
{
auto stream = seq_lens_encoder.stream();
int bsz = seq_lens_this_time.shape()[0];
auto max_len_tensor =
GetEmptyTensor({8}, paddle::DataType::INT32, seq_lens_encoder.place());
GetMaxLen(seq_lens_decoder, seq_lens_this_time, seq_lens_encoder,
max_len_tensor, bsz);
// max_len_this_time, max_enc_len_this_time, max_dec_len_this_time,
// max_enc_dec_len_this_time, max_just_dec_len_this_time,
// max_just_dec_merged_len_this_time, max_system_len,
// max_just_dec_len_without_system
auto max_len_cpu = max_len_tensor.copy_to(paddle::CPUPlace(), false);
auto max_len_cpu_ptr = max_len_cpu.data<int>();
paddle::Tensor max_len_tensor_gpu = GetEmptyTensor({max_len_tensor_cpu.shape()[0]}, paddle::DataType::INT32, seq_lens_this_time.place());
GetMaxLen(seq_lens_decoder, seq_lens_this_time, seq_lens_encoder,
max_len_tensor_gpu, bsz);
max_len_tensor_cpu.copy_(max_len_tensor_gpu, max_len_tensor_cpu.place(), false);
auto max_len_cpu_ptr = max_len_tensor_cpu.data<int>();
int max_len_this_time = max_len_cpu_ptr[0];
int max_enc_len_this_time = max_len_cpu_ptr[1];
int max_dec_len_this_time = max_len_cpu_ptr[2];
@@ -222,14 +225,11 @@ std::vector<paddle::Tensor> GetBlockShapeAndSplitKVBlock(
paddle::Tensor encoder_batch_ids;
paddle::Tensor encoder_tile_ids_per_batch;
paddle::Tensor encoder_num_blocks_x_cpu; /*cpu*/
paddle::Tensor encoder_num_blocks_x_cpu; /*cpu*/
paddle::Tensor kv_batch_ids;
paddle::Tensor kv_tile_ids_per_batch;
paddle::Tensor kv_num_blocks_x_cpu; /*cpu*/
paddle::Tensor decoder_batch_ids;
paddle::Tensor decoder_tile_ids_per_batch;
paddle::Tensor decoder_num_blocks_x_cpu; /*cpu*/
paddle::Tensor max_len_kv_cpu; /*cpu*/
paddle::Tensor kv_num_blocks_x_cpu; /*cpu*/
paddle::Tensor max_len_kv_cpu; /*cpu*/
auto max_len_kv =
GetEmptyTensor({1}, paddle::DataType::INT32, seq_lens_decoder.place());
@@ -291,92 +291,64 @@ std::vector<paddle::Tensor> GetBlockShapeAndSplitKVBlock(
kv_num_blocks_x_cpu =
GetEmptyTensor({0}, paddle::DataType::INT32, seq_lens_encoder.place());
}
if (max_just_dec_len_this_time > 0) {
const uint32_t decoder_max_tile_size_per_bs_q =
div_up((decoder_step_token_num * group_size), decoder_block_shape_q);
decoder_batch_ids =
GetEmptyTensor({bsz * decoder_max_tile_size_per_bs_q},
paddle::DataType::INT32, seq_lens_encoder.place());
decoder_tile_ids_per_batch =
GetEmptyTensor({bsz * decoder_max_tile_size_per_bs_q},
paddle::DataType::INT32, seq_lens_encoder.place());
if (max_just_dec_len_this_time > 0) {
// Clear buffer
const uint32_t decoder_max_tile_size_per_bs_q = div_up((decoder_step_token_num * group_size), decoder_block_shape_q);
const uint32_t decoder_batch_shape = bsz * decoder_max_tile_size_per_bs_q;
PADDLE_ENFORCE_GPU_SUCCESS(cudaMemsetAsync(decoder_batch_ids.data<int>(), 0, decoder_batch_shape * sizeof(int32_t), stream));
PADDLE_ENFORCE_GPU_SUCCESS(cudaMemsetAsync(decoder_tile_ids_per_batch.data<int>(), 0, decoder_batch_shape * sizeof(int32_t), stream));
PADDLE_ENFORCE_GPU_SUCCESS(cudaMemsetAsync(decoder_num_blocks_x_cpu.data<int>(), 0, sizeof(int32_t), stream));
auto decoder_num_blocks_x =
GetEmptyTensor({1}, paddle::DataType::INT32, seq_lens_encoder.place());
split_q_block<<<1, 32, 0, stream>>>(
seq_lens_this_time.data<int>(), seq_lens_encoder.data<int>(),
decoder_batch_ids.data<int>(), decoder_tile_ids_per_batch.data<int>(),
decoder_num_blocks_x.data<int>(), bsz, decoder_block_shape_q,
seq_lens_this_time.data<int>(),
seq_lens_encoder.data<int>(),
decoder_batch_ids.data<int>(),
decoder_tile_ids_per_batch.data<int>(),
decoder_num_blocks_x.data<int>(),
bsz,
decoder_block_shape_q,
group_size);
decoder_num_blocks_x_cpu =
decoder_num_blocks_x.copy_to(paddle::CPUPlace(), false);
} else {
decoder_batch_ids =
GetEmptyTensor({0}, paddle::DataType::INT32, seq_lens_encoder.place());
decoder_tile_ids_per_batch =
GetEmptyTensor({0}, paddle::DataType::INT32, seq_lens_encoder.place());
decoder_num_blocks_x_cpu =
GetEmptyTensor({0}, paddle::DataType::INT32, paddle::CPUPlace());
decoder_num_blocks_x_cpu.copy_(decoder_num_blocks_x, decoder_num_blocks_x_cpu.place(), false);
}
return {encoder_batch_ids,
encoder_tile_ids_per_batch,
encoder_num_blocks_x_cpu, /*cpu*/
kv_batch_ids,
kv_tile_ids_per_batch,
kv_num_blocks_x_cpu, /*cpu*/
decoder_batch_ids,
decoder_tile_ids_per_batch,
decoder_num_blocks_x_cpu, /*cpu*/
max_len_kv_cpu /*cpu*/,
max_len_cpu};
}
std::vector<paddle::DataType> GetBlockShapeAndSplitKVBlockInferDtype(
const paddle::DataType &seq_lens_encoder_dtype,
const paddle::DataType &seq_lens_decoder_dtype,
const paddle::DataType &seq_lens_this_time_dtype) {
return {
paddle::DataType::INT32, paddle::DataType::INT32, paddle::DataType::INT32,
paddle::DataType::INT32, paddle::DataType::INT32, paddle::DataType::INT32,
paddle::DataType::INT32, paddle::DataType::INT32, paddle::DataType::INT32,
paddle::DataType::INT32, paddle::DataType::INT32};
}
std::vector<std::vector<int64_t>> GetBlockShapeAndSplitKVBlockInferShape(
const std::vector<int64_t> &seq_lens_encoder_shape,
const std::vector<int64_t> &seq_lens_decoder_shape,
const std::vector<int64_t> &seq_lens_this_time_shape) {
std::vector<int64_t> dynamic_shape = {-1};
return {dynamic_shape,
dynamic_shape,
{1},
dynamic_shape,
dynamic_shape,
{1},
dynamic_shape,
dynamic_shape,
{1},
{1},
{8}};
encoder_batch_ids,
encoder_tile_ids_per_batch,
encoder_num_blocks_x_cpu, /*cpu*/
kv_batch_ids,
kv_tile_ids_per_batch,
kv_num_blocks_x_cpu, /*cpu*/
max_len_kv_cpu, /*cpu*/
};
}
PD_BUILD_STATIC_OP(get_block_shape_and_split_kv_block)
.Inputs({"seq_lens_encoder", "seq_lens_decoder", "seq_lens_this_time"})
.Outputs({paddle::Optional("encoder_batch_ids"),
paddle::Optional("encoder_tile_ids_per_batch"),
paddle::Optional("encoder_num_blocks"),
paddle::Optional("kv_batch_ids"),
paddle::Optional("kv_tile_ids_per_batch"),
paddle::Optional("kv_num_blocks"),
paddle::Optional("decoder_batch_ids"),
paddle::Optional("decoder_tile_ids_per_batch"),
paddle::Optional("decoder_num_blocks"),
paddle::Optional("max_len_kv"), "set_max_lengths"})
.Attrs({"encoder_block_shape_q: int", "decoder_block_shape_q: int",
"group_size: int", "block_size: int",
"decoder_step_token_num: int"})
.SetKernelFn(PD_KERNEL(GetBlockShapeAndSplitKVBlock))
.SetInferShapeFn(PD_INFER_SHAPE(GetBlockShapeAndSplitKVBlockInferShape))
.SetInferDtypeFn(PD_INFER_DTYPE(GetBlockShapeAndSplitKVBlockInferDtype));
.Inputs({
"seq_lens_encoder",
"seq_lens_decoder",
"seq_lens_this_time",
"decoder_batch_ids",
"decoder_tile_ids_per_batch",
"decoder_num_blocks_x_cpu",
"max_len_tensor_cpu"
})
.Outputs({
paddle::Optional("encoder_batch_ids"),
paddle::Optional("encoder_tile_ids_per_batch"),
paddle::Optional("encoder_num_blocks_x_cpu"),
paddle::Optional("kv_batch_ids"),
paddle::Optional("kv_tile_ids_per_batch"),
paddle::Optional("kv_num_blocks_x_cpu"),
"max_len_kv_cpu"
})
.Attrs({
"encoder_block_shape_q: int",
"decoder_block_shape_q: int",
"group_size: int",
"block_size: int",
"decoder_step_token_num: int"
})
.SetKernelFn(PD_KERNEL(GetBlockShapeAndSplitKVBlock));
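The updated registration passes decoder_batch_ids, decoder_tile_ids_per_batch, decoder_num_blocks_x_cpu and max_len_tensor_cpu in as in-place inputs rather than producing them as outputs, so the infer-shape/infer-dtype callbacks are no longer registered. A hypothetical caller-side sketch of the resulting usage pattern; the buffer sizes, the function name, and the use of plain CPUPlace instead of pinned host memory are assumptions for illustration, not code from this PR.

#include "paddle/extension.h"

// Allocate the decoder bookkeeping buffers once and reuse them every step; the
// custom op now fills them in place. Sizes are illustrative only.
void PrepareDecoderBuffers(int max_batch_size, int max_tiles_per_batch,
                           paddle::Tensor* decoder_batch_ids,
                           paddle::Tensor* decoder_tile_ids_per_batch,
                           paddle::Tensor* decoder_num_blocks_x_cpu,
                           paddle::Tensor* max_len_tensor_cpu) {
  const int64_t max_tiles =
      static_cast<int64_t>(max_batch_size) * max_tiles_per_batch;
  *decoder_batch_ids =
      paddle::full({max_tiles}, 0, paddle::DataType::INT32, paddle::GPUPlace());
  *decoder_tile_ids_per_batch =
      paddle::full({max_tiles}, 0, paddle::DataType::INT32, paddle::GPUPlace());
  // One int32 tile count and eight max-length values, kept host-side
  // (the real buffers are expected to live in pinned host memory).
  *decoder_num_blocks_x_cpu =
      paddle::full({1}, 0, paddle::DataType::INT32, paddle::CPUPlace());
  *max_len_tensor_cpu =
      paddle::full({8}, 0, paddle::DataType::INT32, paddle::CPUPlace());
}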

View File

@@ -586,9 +586,9 @@ __global__ void append_cache_kv_c4(
#pragma unroll
for (uint32_t i = wid * 32 + tid; i < HEAD_DIM; i += 128) {
cache_k_scale_smem[i] = cache_k_scale_now[i];
cache_k_zero_point_smem[i] = cache_k_zp_now[i] - static_cast<T>(136.f);
cache_k_zero_point_smem[i] = cache_k_zp_now[i] + static_cast<T>(136.f);
cache_v_scale_smem[i] = cache_v_scale_now[i];
cache_v_zero_point_smem[i] = cache_v_zp_now[i] - static_cast<T>(136.f);
cache_v_zero_point_smem[i] = cache_v_zp_now[i] + static_cast<T>(136.f);
}
smem_t k_smem(smem);
@@ -640,25 +640,25 @@ __global__ void append_cache_kv_c4(
convert_int4(frag_dq_T + 8, k_frag[2 * i + 1]);
if (row_idx < end_idx) {
k_tile_ptr0[0] = frag_dq_T[0] * cache_k_scale_smem[col_idx] + cache_k_zero_point_smem[col_idx];
k_tile_ptr0[1] = frag_dq_T[1] * cache_k_scale_smem[col_idx + 1] + cache_k_zero_point_smem[col_idx + 1];
k_tile_ptr0[8] = frag_dq_T[2] * cache_k_scale_smem[col_idx + 8] + cache_k_zero_point_smem[col_idx + 8];
k_tile_ptr0[9] = frag_dq_T[3] * cache_k_scale_smem[col_idx + 9] + cache_k_zero_point_smem[col_idx + 9];
k_tile_ptr0[16] = frag_dq_T[8] * cache_k_scale_smem[col_idx + 16] + cache_k_zero_point_smem[col_idx + 16];
k_tile_ptr0[17] = frag_dq_T[9] * cache_k_scale_smem[col_idx + 17] + cache_k_zero_point_smem[col_idx + 17];
k_tile_ptr0[24] = frag_dq_T[10] * cache_k_scale_smem[col_idx + 24] + cache_k_zero_point_smem[col_idx + 24];
k_tile_ptr0[25] = frag_dq_T[11] * cache_k_scale_smem[col_idx + 25] + cache_k_zero_point_smem[col_idx + 25];
k_tile_ptr0[0] = (frag_dq_T[0] - cache_k_zero_point_smem[col_idx]) * cache_k_scale_smem[col_idx];
k_tile_ptr0[1] = (frag_dq_T[1] - cache_k_zero_point_smem[col_idx + 1]) * cache_k_scale_smem[col_idx + 1];
k_tile_ptr0[8] = (frag_dq_T[2] - cache_k_zero_point_smem[col_idx + 8]) * cache_k_scale_smem[col_idx + 8];
k_tile_ptr0[9] = (frag_dq_T[3] - cache_k_zero_point_smem[col_idx + 9]) * cache_k_scale_smem[col_idx + 9];
k_tile_ptr0[16] = (frag_dq_T[8] - cache_k_zero_point_smem[col_idx + 16]) * cache_k_scale_smem[col_idx + 16];
k_tile_ptr0[17] = (frag_dq_T[9] - cache_k_zero_point_smem[col_idx + 17]) * cache_k_scale_smem[col_idx + 17];
k_tile_ptr0[24] = (frag_dq_T[10] - cache_k_zero_point_smem[col_idx + 24]) * cache_k_scale_smem[col_idx + 24];
k_tile_ptr0[25] = (frag_dq_T[11] - cache_k_zero_point_smem[col_idx + 25]) * cache_k_scale_smem[col_idx + 25];
}
if (row_idx + 8 < end_idx) {
k_tile_ptr1[0] = frag_dq_T[4] * cache_k_scale_smem[col_idx] + cache_k_zero_point_smem[col_idx];
k_tile_ptr1[1] = frag_dq_T[5] * cache_k_scale_smem[col_idx + 1] + cache_k_zero_point_smem[col_idx + 1];
k_tile_ptr1[8] = frag_dq_T[6] * cache_k_scale_smem[col_idx + 8] + cache_k_zero_point_smem[col_idx + 8];
k_tile_ptr1[9] = frag_dq_T[7] * cache_k_scale_smem[col_idx + 9] + cache_k_zero_point_smem[col_idx + 9];
k_tile_ptr1[16] = frag_dq_T[12] * cache_k_scale_smem[col_idx + 16] + cache_k_zero_point_smem[col_idx + 16];
k_tile_ptr1[17] = frag_dq_T[13] * cache_k_scale_smem[col_idx + 17] + cache_k_zero_point_smem[col_idx + 17];
k_tile_ptr1[24] = frag_dq_T[14] * cache_k_scale_smem[col_idx + 24] + cache_k_zero_point_smem[col_idx + 24];
k_tile_ptr1[25] = frag_dq_T[15] * cache_k_scale_smem[col_idx + 25] + cache_k_zero_point_smem[col_idx + 25];
k_tile_ptr1[0] = (frag_dq_T[4] - cache_k_zero_point_smem[col_idx]) * cache_k_scale_smem[col_idx];
k_tile_ptr1[1] = (frag_dq_T[5] - cache_k_zero_point_smem[col_idx + 1]) * cache_k_scale_smem[col_idx + 1];
k_tile_ptr1[8] = (frag_dq_T[6] - cache_k_zero_point_smem[col_idx + 8]) * cache_k_scale_smem[col_idx + 8];
k_tile_ptr1[9] = (frag_dq_T[7] - cache_k_zero_point_smem[col_idx + 9]) * cache_k_scale_smem[col_idx + 9];
k_tile_ptr1[16] = (frag_dq_T[12] - cache_k_zero_point_smem[col_idx + 16]) * cache_k_scale_smem[col_idx + 16];
k_tile_ptr1[17] = (frag_dq_T[13] - cache_k_zero_point_smem[col_idx + 17]) * cache_k_scale_smem[col_idx + 17];
k_tile_ptr1[24] = (frag_dq_T[14] - cache_k_zero_point_smem[col_idx + 24]) * cache_k_scale_smem[col_idx + 24];
k_tile_ptr1[25] = (frag_dq_T[15] - cache_k_zero_point_smem[col_idx + 25]) * cache_k_scale_smem[col_idx + 25];
}
col_idx += 32;
}
@@ -711,36 +711,36 @@ __global__ void append_cache_kv_c4(
convert_int4(frag_dq_T, v_frag[2 * i]);
convert_int4(frag_dq_T + 8, v_frag[2 * i + 1]);
if (kv_idx < end_idx) {
v_tile_ptr0[0] = frag_dq_T[0] * cache_v_scale_smem[dim_idx] + cache_v_zero_point_smem[dim_idx];
v_tile_ptr1[0] = frag_dq_T[4] * cache_v_scale_smem[dim_idx + 8] + cache_v_zero_point_smem[dim_idx + 8];
v_tile_ptr0[0] = (frag_dq_T[0] - cache_v_zero_point_smem[dim_idx]) * cache_v_scale_smem[dim_idx];
v_tile_ptr1[0] = (frag_dq_T[4] - cache_v_zero_point_smem[dim_idx + 8]) * cache_v_scale_smem[dim_idx + 8];
}
if (kv_idx + 1 < end_idx) {
v_tile_ptr0[kv_t_stride] = frag_dq_T[1] * cache_v_scale_smem[dim_idx] + cache_v_zero_point_smem[dim_idx];
v_tile_ptr1[kv_t_stride] = frag_dq_T[5] * cache_v_scale_smem[dim_idx + 8] + cache_v_zero_point_smem[dim_idx + 8];
v_tile_ptr0[kv_t_stride] = (frag_dq_T[1] - cache_v_zero_point_smem[dim_idx]) * cache_v_scale_smem[dim_idx];
v_tile_ptr1[kv_t_stride] = (frag_dq_T[5] - cache_v_zero_point_smem[dim_idx + 8]) * cache_v_scale_smem[dim_idx + 8];
}
if (kv_idx + 8 < end_idx) {
v_tile_ptr0[8 * kv_t_stride] = frag_dq_T[2] * cache_v_scale_smem[dim_idx] + cache_v_zero_point_smem[dim_idx];
v_tile_ptr1[8 * kv_t_stride] = frag_dq_T[6] * cache_v_scale_smem[dim_idx + 8] + cache_v_zero_point_smem[dim_idx + 8];
v_tile_ptr0[8 * kv_t_stride] = (frag_dq_T[2] - cache_v_zero_point_smem[dim_idx]) * cache_v_scale_smem[dim_idx];
v_tile_ptr1[8 * kv_t_stride] = (frag_dq_T[6] - cache_v_zero_point_smem[dim_idx + 8]) * cache_v_scale_smem[dim_idx + 8];
}
if (kv_idx + 9 < end_idx) {
v_tile_ptr0[9 * kv_t_stride] = frag_dq_T[3] * cache_v_scale_smem[dim_idx] + cache_v_zero_point_smem[dim_idx];
v_tile_ptr1[9 * kv_t_stride] = frag_dq_T[7] * cache_v_scale_smem[dim_idx + 8] + cache_v_zero_point_smem[dim_idx + 8];
v_tile_ptr0[9 * kv_t_stride] = (frag_dq_T[3] - cache_v_zero_point_smem[dim_idx]) * cache_v_scale_smem[dim_idx];
v_tile_ptr1[9 * kv_t_stride] = (frag_dq_T[7] - cache_v_zero_point_smem[dim_idx + 8]) * cache_v_scale_smem[dim_idx + 8];
}
if (kv_idx + 16 < end_idx) {
v_tile_ptr0[16 * kv_t_stride] = frag_dq_T[8] * cache_v_scale_smem[dim_idx] + cache_v_zero_point_smem[dim_idx];
v_tile_ptr1[16 * kv_t_stride] = frag_dq_T[12] * cache_v_scale_smem[dim_idx + 8] + cache_v_zero_point_smem[dim_idx + 8];
v_tile_ptr0[16 * kv_t_stride] = (frag_dq_T[8] - cache_v_zero_point_smem[dim_idx]) * cache_v_scale_smem[dim_idx];
v_tile_ptr1[16 * kv_t_stride] = (frag_dq_T[12] - cache_v_zero_point_smem[dim_idx + 8]) * cache_v_scale_smem[dim_idx + 8];
}
if (kv_idx + 17 < end_idx) {
v_tile_ptr0[17 * kv_t_stride] = frag_dq_T[9] * cache_v_scale_smem[dim_idx] + cache_v_zero_point_smem[dim_idx];
v_tile_ptr1[17 * kv_t_stride] = frag_dq_T[13] * cache_v_scale_smem[dim_idx + 8] + cache_v_zero_point_smem[dim_idx + 8];
v_tile_ptr0[17 * kv_t_stride] = (frag_dq_T[9] - cache_v_zero_point_smem[dim_idx]) * cache_v_scale_smem[dim_idx];
v_tile_ptr1[17 * kv_t_stride] = (frag_dq_T[13] - cache_v_zero_point_smem[dim_idx + 8]) * cache_v_scale_smem[dim_idx + 8];
}
if (kv_idx + 24 < end_idx) {
v_tile_ptr0[24 * kv_t_stride] = frag_dq_T[10] * cache_v_scale_smem[dim_idx] + cache_v_zero_point_smem[dim_idx];
v_tile_ptr1[24 * kv_t_stride] = frag_dq_T[14] * cache_v_scale_smem[dim_idx + 8] + cache_v_zero_point_smem[dim_idx + 8];
v_tile_ptr0[24 * kv_t_stride] = (frag_dq_T[10] - cache_v_zero_point_smem[dim_idx]) * cache_v_scale_smem[dim_idx];
v_tile_ptr1[24 * kv_t_stride] = (frag_dq_T[14] - cache_v_zero_point_smem[dim_idx + 8]) * cache_v_scale_smem[dim_idx + 8];
}
if (kv_idx + 25 < end_idx) {
v_tile_ptr0[25 * kv_t_stride] = frag_dq_T[11] * cache_v_scale_smem[dim_idx] + cache_v_zero_point_smem[dim_idx];
v_tile_ptr1[25 * kv_t_stride] = frag_dq_T[15] * cache_v_scale_smem[dim_idx + 8] + cache_v_zero_point_smem[dim_idx + 8];
v_tile_ptr0[25 * kv_t_stride] = (frag_dq_T[11] - cache_v_zero_point_smem[dim_idx]) * cache_v_scale_smem[dim_idx];
v_tile_ptr1[25 * kv_t_stride] = (frag_dq_T[15] - cache_v_zero_point_smem[dim_idx + 8]) * cache_v_scale_smem[dim_idx + 8];
}
kv_idx += 32;
}
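The rewritten lines in this hunk change the int4 KV-cache dequantization from treating the stored zero point as an additive bias to the usual asymmetric form, i.e. from w = q * scale + zp to w = (q - zp) * scale; consistent with the zero point now being subtracted, the 136 offset applied when the zero points are staged into shared memory flips sign as well. A minimal standalone sketch of the two conventions (the helper names and the generic type T are illustrative, not the kernel's actual types):

template <typename T>
inline T dequant_zp_as_bias(T q, T scale, T zp) {   // old path: zero point as additive bias
  return q * scale + zp;
}

template <typename T>
inline T dequant_asymmetric(T q, T scale, T zp) {   // new path: subtract zp, then scale
  return (q - zp) * scale;
}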
@@ -956,6 +956,30 @@ std::vector<paddle::Tensor> GQARopeWriteCacheKernel(
rotary_embs.dims()[2],
head_dim,
stream);
if (token_num < kv_token_num) {
AppendCacheKV<data_t, 128, 64>(
key_cache,
value_cache,
cache_k_dequant_scales.get(),
cache_v_dequant_scales.get(),
cache_k_zp.get(),
cache_v_zp.get(),
seq_lens_this_time,
seq_lens_decoder,
cu_seqlens_k,
block_tables,
cache_batch_ids,
cache_tile_ids,
cache_num_blocks,
max_blocks_per_seq,
kv_num_heads,
cache_quant_type,
&k,
&v,
stream
);
}
// write cache
if (cache_quant_type == "none") {
CascadeAppendWriteCacheKVQKV<data_t>(
@@ -1038,30 +1062,6 @@ std::vector<paddle::Tensor> GQARopeWriteCacheKernel(
}
}
}
if (token_num < kv_token_num) {
AppendCacheKV<data_t, 128, 64>(
key_cache,
value_cache,
cache_k_dequant_scales.get(),
cache_v_dequant_scales.get(),
cache_k_zp.get(),
cache_v_zp.get(),
seq_lens_this_time,
seq_lens_decoder,
cu_seqlens_k,
block_tables,
cache_batch_ids,
cache_tile_ids,
cache_num_blocks,
max_blocks_per_seq,
kv_num_heads,
cache_quant_type,
&k,
&v,
stream
);
}
return {q, k, v, qkv_out};
}

View File

@@ -235,8 +235,14 @@ std::vector<paddle::Tensor> GetBlockShapeAndSplitKVBlock(
const paddle::Tensor &seq_lens_encoder,
const paddle::Tensor &seq_lens_decoder,
const paddle::Tensor &seq_lens_this_time,
const int encoder_block_shape_q, const int decoder_block_shape_q,
const int group_size, const int block_size,
paddle::Tensor &decoder_batch_ids, // Inplace
paddle::Tensor &decoder_tile_ids_per_batch, // Inplace
paddle::Tensor &decoder_num_blocks_x_cpu, // Inplace, Pinned Memory
paddle::Tensor &max_len_tensor_cpu, // Inplace, Pinned Memory
const int encoder_block_shape_q,
const int decoder_block_shape_q,
const int group_size,
const int block_size,
const int decoder_step_token_num);
std::vector<paddle::Tensor> GetPaddingOffset(const paddle::Tensor &input_ids,
@@ -266,13 +272,12 @@ void GetStopFlagsMulti(const paddle::Tensor &topk_ids,
const paddle::Tensor &seq_lens,
const paddle::Tensor &end_ids,
const paddle::Tensor &next_tokens,
const paddle::Tensor &pre_ids,
const paddle::Tensor &step_idx,
const paddle::Tensor &stop_seqs,
const paddle::Tensor &stop_seqs_len,
const bool beam_search);
void GetStopFlagsMultiSeqs(
const paddle::Tensor &topk_ids, const paddle::Tensor &pre_ids,
const paddle::Tensor &step_idx, const paddle::Tensor &stop_flags,
const paddle::Tensor &seq_lens, const paddle::Tensor &stop_seqs,
const paddle::Tensor &stop_seqs_len, const paddle::Tensor &end_ids);
void UpdateInputes(const paddle::Tensor &stop_flags,
const paddle::Tensor &not_need_stop, // only on cpu
@@ -954,12 +959,6 @@ PYBIND11_MODULE(fastdeploy_ops, m) {
m.def("set_stop_value_multi_ends", &GetStopFlagsMulti,
"update_inputs function");
/**
* stop_generation_multi_stop_seqs.cu
* set_stop_value_multi_seqs
*/
m.def("set_stop_value_multi_seqs", &GetStopFlagsMultiSeqs,
"update_inputs function");
/**
* update_inputs.cu

View File

@@ -133,10 +133,18 @@ public:
template <typename TypeA, typename Arch>
struct LayoutDetailsB<TypeA, uint2b_t, Arch, typename platform::enable_if<Arch::kMinComputeCapability >= 75>::type>
{
static constexpr int ThreadblockK = 128 * 8 / cutlass::sizeof_bits<TypeA>::value;
using Layout = layout::RowMajor;
static constexpr int ElementsPerAccess = 128 / cutlass::sizeof_bits<TypeA>::value;
using Operator = cutlass::arch::OpMultiplyAdd;
static constexpr int ThreadblockK = 128 * 8 / cutlass::sizeof_bits<TypeA>::value; // 64
private:
static constexpr int ElementsPerCacheLine = 128 * 8 / sizeof_bits<uint2b_t>::value;
static constexpr int ColumnsInterleaved = ElementsPerCacheLine / ThreadblockK; // 8
public:
// using Layout = layout::ColumnMajor;
// static constexpr int ElementsPerAccess = 16; // at least 4-bytes
using Layout = layout::ColumnMajorTileInterleave<ThreadblockK, ColumnsInterleaved>;
static constexpr int ElementsPerAccess = 128 / cutlass::sizeof_bits<uint2b_t>::value; // 64
using Operator = cutlass::arch::OpMultiplyAddDequantizeInterleavedBToA;
};
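For reference, the inline "// 64" and "// 8" annotations above follow directly from cache-line arithmetic for 16-bit activations and 2-bit weights; the block below only restates the values already implied by the diff.

constexpr int kBitsPerCacheLine    = 128 * 8;                             // 1024-bit (128-byte) line
constexpr int kThreadblockK        = kBitsPerCacheLine / 16;              // 64 fp16/bf16 elements
constexpr int kElemsPerCacheLineB  = kBitsPerCacheLine / 2;               // 512 packed uint2b_t weights
constexpr int kColumnsInterleaved  = kElemsPerCacheLineB / kThreadblockK; // 8
constexpr int kElementsPerAccessB  = 128 / 2;                             // 64 weights per 128-bit access
static_assert(kColumnsInterleaved == 8, "matches the annotation in the specialization above");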
template <typename TypeA, typename Arch>

View File

@@ -18,14 +18,12 @@
#include "cutlass_extensions/gemm/threadblock/default_dq_mma_multistage.h"
#include "cutlass_extensions/gemm/threadblock/default_dq_mma_pipelined.h"
#include "cutlass_extensions/gemm/threadblock/default_wint2x_mma.h"
#include "cutlass_extensions/gemm/threadblock/default_mma_bf16.h"
namespace cutlass
{
namespace gemm
{
namespace threadblock
{
namespace cutlass {
namespace gemm {
namespace threadblock {
////////////////////////////////////////////////////////////////////////////////
@@ -378,38 +376,23 @@ template <
struct DefaultMma<cutlass::half_t, LayoutA, kAlignmentA, uint2b_t, LayoutB, kAlignmentB, ElementAccumulator,
layout::RowMajor, arch::OpClassTensorOp, ArchTag, ThreadblockShape, WarpShape, InstructionShape, 2, Operator>
{
static cutlass::arch::CacheOperation::Kind const CacheOpA =
((sizeof_bits<half_t>::value * kAlignmentA) == 128) ? cutlass::arch::CacheOperation::Global
: cutlass::arch::CacheOperation::Always;
static cutlass::arch::CacheOperation::Kind const CacheOpB =
((sizeof_bits<half_t>::value * kAlignmentB) == 128) ? cutlass::arch::CacheOperation::Global
: cutlass::arch::CacheOperation::Always;
private:
using Mma = DefaultWint2xMma<half_t, LayoutA, kAlignmentA, uint2b_t, LayoutB, kAlignmentB,
ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp, ArchTag, ThreadblockShape,
WarpShape, InstructionShape, 2, Operator>;
public:
// Define the MmaCore components
using MmaCore =
typename cutlass::gemm::threadblock::DefaultMmaCore<ThreadblockShape, WarpShape, InstructionShape, half_t,
LayoutA, half_t, LayoutB, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp, 3, Operator,
false, CacheOpA, CacheOpB>;
using MmaCore = typename Mma::MmaCore;
// Define iterators over tiles from the A operand
using ThreadMapA = typename MmaCore::IteratorThreadMapA;
using AccessTypeA = cutlass::Array<half_t, kAlignmentA>;
using IteratorA = cutlass::transform::threadblock::PredicatedTileAccessIterator<
cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>, half_t, LayoutA, 1, ThreadMapA,
AccessTypeA>;
using IteratorA = typename Mma::IteratorA;
// Define iterators over tiles from the B operand
using ThreadMapB = typename MmaCore::IteratorThreadMapB;
using AccessTypeB = cutlass::Array<half_t, kAlignmentB>;
using IteratorB = cutlass::transform::threadblock::PredicatedTileAccessIterator<
cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>, half_t, LayoutB, 0, ThreadMapB,
AccessTypeB>;
using IteratorB = typename Mma::IteratorB;
// Define the threadblock-scoped multistage matrix multiply
using ThreadblockMma = cutlass::gemm::threadblock::Wint2xMmaMultistage<typename MmaCore::Shape, IteratorA,
typename MmaCore::SmemIteratorA, MmaCore::kCacheOpA, IteratorB, typename MmaCore::SmemIteratorB,
MmaCore::kCacheOpB, ElementAccumulator, layout::RowMajor, typename MmaCore::MmaPolicy, 2>;
using ThreadblockMma = typename Mma::ThreadblockMma;
};
template <
@@ -441,38 +424,23 @@ struct DefaultMma<half_t, LayoutA, kAlignmentA, uint2b_t, LayoutB, kAlignmentB,
layout::RowMajor, arch::OpClassTensorOp, ArchTag, ThreadblockShape, WarpShape, InstructionShape, kStages, Operator,
false, SharedMemoryClear>
{
static cutlass::arch::CacheOperation::Kind const CacheOpA =
((sizeof_bits<half_t>::value * kAlignmentA) == 128) ? cutlass::arch::CacheOperation::Global
: cutlass::arch::CacheOperation::Always;
static cutlass::arch::CacheOperation::Kind const CacheOpB =
((sizeof_bits<half_t>::value * kAlignmentB) == 128) ? cutlass::arch::CacheOperation::Global
: cutlass::arch::CacheOperation::Always;
private:
using Mma = DefaultWint2xMma<half_t, LayoutA, kAlignmentA, uint2b_t, LayoutB, kAlignmentB,
ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp, ArchTag, ThreadblockShape,
WarpShape, InstructionShape, kStages, Operator, SharedMemoryClear>;
public:
// Define the MmaCore components
using MmaCore =
typename cutlass::gemm::threadblock::DefaultMmaCore<ThreadblockShape, WarpShape, InstructionShape, half_t,
LayoutA, half_t, LayoutB, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp, kStages, Operator,
false, CacheOpA, CacheOpB>;
using MmaCore = typename Mma::MmaCore;
// Define iterators over tiles from the A operand
using ThreadMapA = typename MmaCore::IteratorThreadMapA;
using AccessTypeA = cutlass::Array<half_t, kAlignmentA>;
using IteratorA = cutlass::transform::threadblock::PredicatedTileAccessIterator<
cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>, half_t, LayoutA, 1, ThreadMapA,
AccessTypeA>;
using IteratorA = typename Mma::IteratorA;
// Define iterators over tiles from the B operand
using ThreadMapB = typename MmaCore::IteratorThreadMapB;
using AccessTypeB = cutlass::Array<half_t, kAlignmentB>;
using IteratorB = cutlass::transform::threadblock::PredicatedTileAccessIterator<
cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>, half_t, LayoutB, 0, ThreadMapB,
AccessTypeB>;
using IteratorB = typename Mma::IteratorB;
// Define the threadblock-scoped multistage matrix multiply
using ThreadblockMma = cutlass::gemm::threadblock::Wint2xMmaMultistage<typename MmaCore::Shape, IteratorA,
typename MmaCore::SmemIteratorA, MmaCore::kCacheOpA, IteratorB, typename MmaCore::SmemIteratorB,
MmaCore::kCacheOpB, ElementAccumulator, layout::RowMajor, typename MmaCore::MmaPolicy, kStages, SharedMemoryClear>;
using ThreadblockMma = typename Mma::ThreadblockMma;
};
} // namespace threadblock

View File

@@ -19,7 +19,7 @@
#include "cutlass/gemm/threadblock/default_mma.h"
#include "cutlass_extensions/gemm/threadblock/default_dq_mma_multistage.h"
#include "cutlass_extensions/gemm/threadblock/default_dq_mma_pipelined.h"
#include "cutlass_extensions/gemm/threadblock/wint2x_mma_multistage.h"
#include "cutlass_extensions/gemm/threadblock/default_wint2x_mma.h"
namespace cutlass {
namespace gemm {
@@ -379,38 +379,23 @@ template <
struct DefaultMma<cutlass::bfloat16_t, LayoutA, kAlignmentA, uint2b_t, LayoutB, kAlignmentB, ElementAccumulator,
layout::RowMajor, arch::OpClassTensorOp, ArchTag, ThreadblockShape, WarpShape, InstructionShape, 2, Operator>
{
static cutlass::arch::CacheOperation::Kind const CacheOpA =
((sizeof_bits<bfloat16_t>::value * kAlignmentA) == 128) ? cutlass::arch::CacheOperation::Global
: cutlass::arch::CacheOperation::Always;
static cutlass::arch::CacheOperation::Kind const CacheOpB =
((sizeof_bits<bfloat16_t>::value * kAlignmentB) == 128) ? cutlass::arch::CacheOperation::Global
: cutlass::arch::CacheOperation::Always;
private:
using Mma = DefaultWint2xMma<bfloat16_t, LayoutA, kAlignmentA, uint2b_t, LayoutB, kAlignmentB,
ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp, ArchTag, ThreadblockShape,
WarpShape, InstructionShape, 2, Operator>;
public:
// Define the MmaCore components
using MmaCore =
typename cutlass::gemm::threadblock::DefaultMmaCore<ThreadblockShape, WarpShape, InstructionShape, bfloat16_t,
LayoutA, bfloat16_t, LayoutB, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp, 3, Operator,
false, CacheOpA, CacheOpB>;
using MmaCore = typename Mma::MmaCore;
// Define iterators over tiles from the A operand
using ThreadMapA = typename MmaCore::IteratorThreadMapA;
using AccessTypeA = cutlass::Array<bfloat16_t, kAlignmentA>;
using IteratorA = cutlass::transform::threadblock::PredicatedTileAccessIterator<
cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>, bfloat16_t, LayoutA, 1, ThreadMapA,
AccessTypeA>;
using IteratorA = typename Mma::IteratorA;
// Define iterators over tiles from the B operand
using ThreadMapB = typename MmaCore::IteratorThreadMapB;
using AccessTypeB = cutlass::Array<bfloat16_t, kAlignmentB>;
using IteratorB = cutlass::transform::threadblock::PredicatedTileAccessIterator<
cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>, bfloat16_t, LayoutB, 0, ThreadMapB,
AccessTypeB>;
using IteratorB = typename Mma::IteratorB;
// Define the threadblock-scoped multistage matrix multiply
using ThreadblockMma = cutlass::gemm::threadblock::Wint2xMmaMultistage<typename MmaCore::Shape, IteratorA,
typename MmaCore::SmemIteratorA, MmaCore::kCacheOpA, IteratorB, typename MmaCore::SmemIteratorB,
MmaCore::kCacheOpB, ElementAccumulator, layout::RowMajor, typename MmaCore::MmaPolicy, 2>;
using ThreadblockMma = typename Mma::ThreadblockMma;
};
template <
@@ -442,38 +427,23 @@ struct DefaultMma<bfloat16_t, LayoutA, kAlignmentA, uint2b_t, LayoutB, kAlignmen
layout::RowMajor, arch::OpClassTensorOp, ArchTag, ThreadblockShape, WarpShape, InstructionShape, kStages, Operator,
false, SharedMemoryClear>
{
static cutlass::arch::CacheOperation::Kind const CacheOpA =
((sizeof_bits<bfloat16_t>::value * kAlignmentA) == 128) ? cutlass::arch::CacheOperation::Global
: cutlass::arch::CacheOperation::Always;
static cutlass::arch::CacheOperation::Kind const CacheOpB =
((sizeof_bits<bfloat16_t>::value * kAlignmentB) == 128) ? cutlass::arch::CacheOperation::Global
: cutlass::arch::CacheOperation::Always;
private:
using Mma = DefaultWint2xMma<bfloat16_t, LayoutA, kAlignmentA, uint2b_t, LayoutB, kAlignmentB,
ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp, ArchTag, ThreadblockShape,
WarpShape, InstructionShape, kStages, Operator, SharedMemoryClear>;
public:
// Define the MmaCore components
using MmaCore =
typename cutlass::gemm::threadblock::DefaultMmaCore<ThreadblockShape, WarpShape, InstructionShape, bfloat16_t,
LayoutA, bfloat16_t, LayoutB, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp, kStages, Operator,
false, CacheOpA, CacheOpB>;
using MmaCore = typename Mma::MmaCore;
// Define iterators over tiles from the A operand
using ThreadMapA = typename MmaCore::IteratorThreadMapA;
using AccessTypeA = cutlass::Array<bfloat16_t, kAlignmentA>;
using IteratorA = cutlass::transform::threadblock::PredicatedTileAccessIterator<
cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>, bfloat16_t, LayoutA, 1, ThreadMapA,
AccessTypeA>;
using IteratorA = typename Mma::IteratorA;
// Define iterators over tiles from the B operand
using ThreadMapB = typename MmaCore::IteratorThreadMapB;
using AccessTypeB = cutlass::Array<bfloat16_t, kAlignmentB>;
using IteratorB = cutlass::transform::threadblock::PredicatedTileAccessIterator<
cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>, bfloat16_t, LayoutB, 0, ThreadMapB,
AccessTypeB>;
using IteratorB = typename Mma::IteratorB;
// Define the threadblock-scoped multistage matrix multiply
using ThreadblockMma = cutlass::gemm::threadblock::Wint2xMmaMultistage<typename MmaCore::Shape, IteratorA,
typename MmaCore::SmemIteratorA, MmaCore::kCacheOpA, IteratorB, typename MmaCore::SmemIteratorB,
MmaCore::kCacheOpB, ElementAccumulator, layout::RowMajor, typename MmaCore::MmaPolicy, kStages, SharedMemoryClear>;
using ThreadblockMma = typename Mma::ThreadblockMma;
};
} // namespace threadblock

View File

@@ -0,0 +1,182 @@
/***************************************************************************************************
* Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
#pragma once
#include "cutlass/gemm/threadblock/default_mma_core_sm80.h"
namespace cutlass {
namespace gemm {
namespace threadblock {
/// Partial specialization:
///
/// A: row-major
/// B: uint2b_t, column-major
/// Operator: tensor op class
///
/// This uses the default warp-level operator given tile sizes
template <
/// Shape of threadblock-scoped matrix multiply operator (concept:
/// GemmShape)
typename Shape_,
/// Shape of warp-level matrix multiply operator (concept: GemmShape)
typename WarpShape_,
/// Shape of one matrix production operation (concept: GemmShape)
typename InstructionShape_,
/// Data type of A operand
typename ElementA_,
/// Data type of accumulator
typename ElementC_,
/// Layout of accumulator
typename LayoutC_,
/// Number of stages
int Stages,
/// Operation performed by MMA
typename Operator_,
/// Cache operation of operand A
cutlass::arch::CacheOperation::Kind CacheOpA,
/// Cache operation of operand B
cutlass::arch::CacheOperation::Kind CacheOpB>
struct DefaultMmaCore<Shape_, WarpShape_, InstructionShape_, ElementA_,
layout::RowMajor, uint2b_t, layout::ColumnMajor,
ElementC_, LayoutC_, arch::OpClassTensorOp, Stages,
Operator_, false, CacheOpA, CacheOpB> {
using Shape = Shape_;
using WarpShape = WarpShape_;
using InstructionShape = InstructionShape_;
using ElementA = ElementA_;
using LayoutA = layout::RowMajor;
using ElementB = uint2b_t;
using LayoutB = layout::ColumnMajor;
using ElementC = ElementC_;
using LayoutC = LayoutC_;
static int const kStages = Stages;
static cutlass::arch::CacheOperation::Kind const kCacheOpA = CacheOpA;
static cutlass::arch::CacheOperation::Kind const kCacheOpB = CacheOpB;
/// Number of warps present
using WarpCount = GemmShape<Shape::kM / WarpShape::kM,
Shape::kN / WarpShape::kN,
Shape::kK / WarpShape::kK>;
// Divisibility requirements
static_assert(
!(Shape::kM % WarpShape::kM) && !(Shape::kN % WarpShape::kN),
"Threadblock-scoped GEMM should be divisible by warp-scoped GEMM size.");
/// Number of threads per warp
static int const kWarpSize = warp::WarpSize<arch::OpClassTensorOp>::value;
/// Size of a threadblock-scoped access
static int const kAccessSizeInBits = 128;
/// Number of threads total
static int const kThreads = WarpCount::kCount * kWarpSize;
/// Size of a threadblock-scoped access of B
static constexpr int kMaxThreadsForB =
(Shape::kK * Shape::kN * sizeof_bits<ElementB>::value) / kAccessSizeInBits;
static constexpr int kThreadsForB =
kMaxThreadsForB > kThreads ? kThreads : kMaxThreadsForB;
/// Default Operator
using Operator = Operator_;
// Warp thread arrangement
static int const kWarpThreadArrangementContiguousA =
Shape::kK / (kAccessSizeInBits / sizeof_bits<ElementA>::value);
static int const kWarpThreadArrangementStridedA =
kWarpSize / kWarpThreadArrangementContiguousA;
static int const kWarpThreadArrangementContiguousB =
Shape::kK / (kAccessSizeInBits / sizeof_bits<ElementB>::value);
static int const kWarpThreadArrangementStridedB =
kWarpSize / kWarpThreadArrangementContiguousB;
//
// Shared memory layouts
//
using SmemLayoutA = layout::RowMajorTensorOpMultiplicandCrosswise<
sizeof_bits<ElementA>::value, Shape::kK>;
// Shared memory layout
using SmemLayoutB = layout::ColumnMajorTensorOpMultiplicandCrosswise<
sizeof_bits<ElementB>::value, Shape::kK>;
//
// Iterators to write to shared memory
//
/// ThreadMap of iterator A
using IteratorThreadMapA = transform::PitchLinearWarpRakedThreadMap<
layout::PitchLinearShape<Shape::kK, Shape::kM>, kThreads,
layout::PitchLinearShape<kWarpThreadArrangementContiguousA,
kWarpThreadArrangementStridedA>,
kAccessSizeInBits / sizeof_bits<ElementA>::value>;
/// Shared memory iterator to A operand
using SmemIteratorA = transform::threadblock::RegularTileAccessIterator<
MatrixShape<Shape::kM, Shape::kK>, ElementA, SmemLayoutA, 0,
IteratorThreadMapA>;
/// ThreadMap of iterator B
using IteratorThreadMapB = transform::PitchLinearWarpRakedThreadMap<
layout::PitchLinearShape<Shape::kK, Shape::kN>, kThreadsForB,
layout::PitchLinearShape<kWarpThreadArrangementContiguousB,
kWarpThreadArrangementStridedB>,
kAccessSizeInBits / sizeof_bits<ElementB>::value>;
/// Shared memory iterator to B operand
using SmemIteratorB = transform::threadblock::RegularTileAccessIterator<
MatrixShape<Shape::kK, Shape::kN>, ElementB, SmemLayoutB, 1,
IteratorThreadMapB>;
//
// Warp-level matrix multiply operator
//
// Define the warp-level tensor op
using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp<
WarpShape, InstructionShape, ElementA, SmemLayoutA, ElementB, SmemLayoutB,
ElementC, LayoutC, Operator, WarpCount::kK>::Type;
/// Policy used to define MmaPipelined
using MmaPolicy = MmaPolicy<MmaTensorOp, MatrixShape<0, 0>,
MatrixShape<0, 0>, WarpCount::kK>;
};
} // namespace threadblock
} // namespace gemm
} // namespace cutlass
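To make the B-operand thread-map constants above concrete, here is the arithmetic for a hypothetical 64x64x64 threadblock tile processed by a single warp; the tile and warp shapes are assumptions for illustration, only the formulas come from the specialization above.

constexpr int kAccessSizeInBits = 128;
constexpr int kBitsB            = 2;    // sizeof_bits<uint2b_t>
constexpr int kShapeK           = 64;   // assumed Shape::kK
constexpr int kShapeN           = 64;   // assumed Shape::kN
constexpr int kThreads          = 32;   // assumed: one warp per threadblock
constexpr int kMaxThreadsForB   = (kShapeK * kShapeN * kBitsB) / kAccessSizeInBits;        // 64
constexpr int kThreadsForB      = kMaxThreadsForB > kThreads ? kThreads : kMaxThreadsForB; // 32
constexpr int kWarpThreadArrangementContiguousB = kShapeK / (kAccessSizeInBits / kBitsB);  // 64 / 64 = 1
constexpr int kWarpThreadArrangementStridedB    = 32 / kWarpThreadArrangementContiguousB;  // 32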

View File

@@ -0,0 +1,246 @@
/*
* SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include "cutlass_extensions/arch/mma.h"
#include "cutlass_extensions/gemm/threadblock/default_dq_mma.h"
#include "cutlass_extensions/gemm/threadblock/default_mma_core.h"
#include "cutlass_extensions/gemm/threadblock/wint2x_mma_multistage.h"
#include "cutlass_extensions/gemm/threadblock/wint2x_params_accessor.h"
namespace cutlass {
namespace gemm {
namespace threadblock {
////////////////////////////////////////////////////////////////////////////////
template <typename ThreadblockShape, typename ElementT, int GroupSize>
struct DefaultQuantParamsIterators {
private:
static constexpr int kAlignment = 128 / sizeof_bits<ElementT>::value;
static_assert((ThreadblockShape::kN % kAlignment) == 0, "");
static constexpr int kRows =
(GroupSize == -1) ? 1 : (ThreadblockShape::kK + GroupSize - 1) / GroupSize;
static constexpr int kColumns = ThreadblockShape::kN;
using IteratorThreadMap = transform::PitchLinearStripminedThreadMap<
layout::PitchLinearShape<kColumns, kRows>,
kColumns / kAlignment, kAlignment>;
public:
using Iterator = cutlass::transform::threadblock::PredicatedTileIterator<
MatrixShape<kRows, kColumns>, ElementT, layout::RowMajor, 0,
IteratorThreadMap, kAlignment>;
using SmemIterator = Iterator;
};
template <typename ThreadblockShape, int GroupSize>
struct DefaultQuantParamsIterators<ThreadblockShape, uint4b_t, GroupSize> {
private:
static constexpr int kAlignment = 32 / sizeof_bits<uint4b_t>::value;
static_assert((ThreadblockShape::kN % kAlignment) == 0, "");
static constexpr int kRows =
(GroupSize == -1) ? 1 : (ThreadblockShape::kK + 2 * GroupSize - 1) / (2 * GroupSize);
static constexpr int kColumns =
(GroupSize == -1) ? ThreadblockShape::kN : ThreadblockShape::kN * 2;
using IteratorThreadMap = transform::PitchLinearStripminedThreadMap<
layout::PitchLinearShape<kColumns, kRows>,
kColumns / kAlignment, kAlignment>;
public:
using AccessType = cutlass::Array<uint4b_t, kAlignment>;
using Iterator = cutlass::transform::threadblock::PredicatedTileAccessIterator<
MatrixShape<kRows, kColumns>, uint4b_t, layout::RowMajor,
0, IteratorThreadMap, AccessType>;
using SmemIterator = Iterator;
};
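As a concrete reading of the uint4b_t specialization above, using the kGroupSize of 64 that DefaultWint2xMma fixes further down and an assumed 64x64 threadblock K/N (the tile shape is an assumption; the formulas are copied from the code):

constexpr int kGroupSize  = 64;                     // group-wise local_scale granularity
constexpr int kTbK        = 64, kTbN = 64;          // assumed ThreadblockShape::kK / kN
constexpr int kAlignment  = 32 / 4;                 // 8 uint4b_t elements per 32-bit access
constexpr int kRows       = (kTbK + 2 * kGroupSize - 1) / (2 * kGroupSize);  // 1
constexpr int kColumns    = kTbN * 2;               // kN * 2, as written in the specialization
static_assert(kTbN % kAlignment == 0, "mirrors the static_assert in the specialization");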
template <
/// Element type for A matrix operand
typename ElementA_,
/// Layout type for A matrix operand
typename LayoutA_,
/// Access granularity of A matrix in units of elements
int kAlignmentA,
/// Element type for B matrix operand
typename ElementB_,
/// Layout type for B matrix operand
typename LayoutB_,
/// Access granularity of B matrix in units of elements
int kAlignmentB,
/// Element type for internal accumulation
typename ElementAccumulator_,
/// Layout type for C and D matrix operands
typename LayoutC_,
/// Operator class tag
typename OperatorClass_,
/// Tag indicating architecture to tune for
typename ArchTag_,
/// Threadblock-level tile size (concept: GemmShape)
typename ThreadblockShape_,
/// Warp-level tile size (concept: GemmShape)
typename WarpShape_,
/// Instruction-level tile size (concept: GemmShape)
typename InstructionShape_,
/// Number of stages used in the pipelined mainloop
int Stages,
/// Operation performed by GEMM
typename Operator_,
/// Use zfill or predicate for out-of-bound cp.async
SharedMemoryClearOption SharedMemoryClear = SharedMemoryClearOption::kNone>
struct DefaultWint2xMma;
////////////////////////////////////////////////////////////////////////////////
template <
/// Type for element A
typename ElementA,
/// Layout type for A matrix operand
typename LayoutA,
/// Access granularity of A matrix in units of elements
int kAlignmentA,
/// Type for element B
typename ElementB,
/// Layout type for B matrix operand
typename LayoutB,
/// Access granularity of B matrix in units of elements
int kAlignmentB,
/// Element type for internal accumulation
typename ElementAccumulator,
/// Operator class tag
typename OperatorClass,
/// Tag indicating architecture to tune for
typename ArchTag,
/// Threadblock-level tile size (concept: GemmShape)
typename ThreadblockShape,
/// Warp-level tile size (concept: GemmShape)
typename WarpShape,
/// Instruction-level tile size (concept: GemmShape)
typename InstructionShape,
/// Stages in GEMM
int kStages,
/// Operator performed by GEMM
typename Operator,
/// Use zfill or predicate for out-of-bound cp.async
SharedMemoryClearOption SharedMemoryClear>
struct DefaultWint2xMma<ElementA, LayoutA, kAlignmentA, ElementB, LayoutB, kAlignmentB, ElementAccumulator,
layout::RowMajor, OperatorClass, ArchTag, ThreadblockShape, WarpShape, InstructionShape,
kStages, Operator, SharedMemoryClear>
{
public:
static_assert(platform::is_same<ElementA, half_t>::value || platform::is_same<ElementA, bfloat16_t>::value,
"Element A must be fp16 or bf16");
static_assert(platform::is_same<ElementB, uint2b_t>::value,
"Element B must be uint2b_t");
static_assert(platform::is_same<Operator, arch::OpMultiplyAddDequantizeInterleavedBToA>::value,
"Mma multistage must dequantize after ldsm");
using ElementSuperScale = ElementA;
using ElementLocalScale = uint4b_t;
using ElementCodeScaleZp = float;
static constexpr int kGroupSize = 64;
static cutlass::arch::CacheOperation::Kind const CacheOpA = ((sizeof_bits<ElementA>::value * kAlignmentA) == 128)
? cutlass::arch::CacheOperation::Global
: cutlass::arch::CacheOperation::Always;
static cutlass::arch::CacheOperation::Kind const CacheOpB = ((sizeof_bits<ElementB>::value * kAlignmentB) == 128)
? cutlass::arch::CacheOperation::Global
: cutlass::arch::CacheOperation::Always;
// Define the MmaCore components
// Mma core does not depend on stages, so pass in at least 3 here so that the mma multistage pieces are created
using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<ThreadblockShape, WarpShape, InstructionShape,
ElementA, LayoutA, ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, OperatorClass,
std::max(kStages, 3), Operator, false, CacheOpA, CacheOpB>;
// Define iterators over tiles from the A operand
using ThreadMapA = typename MmaCore::IteratorThreadMapA;
using AccessTypeA = cutlass::Array<ElementA, kAlignmentA>;
using IteratorA = cutlass::transform::threadblock::PredicatedTileAccessIterator<
cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>, ElementA, LayoutA, 1, ThreadMapA,
AccessTypeA>;
private:
static constexpr int kColumnsInterleaved = LayoutB::kColumnsInterleaved;
static constexpr int kRowsPerTile = LayoutB::kRowsPerTile;
static_assert(!(MmaCore::Shape::kN % kColumnsInterleaved), "ThreadblockShape must be divisible by kColumnsInterleaved");
static_assert(kRowsPerTile == MmaCore::Shape::kK, "");
using ThreadMapB = typename MmaCore::IteratorThreadMapB;
using WarpArrangement = typename ThreadMapB::Detail::WarpThreadArrangement;
static_assert(!(WarpArrangement::kStrided % kColumnsInterleaved), "");
using IteratorShapeB = MatrixShape<
MmaCore::Shape::kK * kColumnsInterleaved, MmaCore::Shape::kN / kColumnsInterleaved>;
using InterleavedThreadMapB = transform::PitchLinearWarpRakedThreadMap<
layout::PitchLinearShape<IteratorShapeB::kRow, IteratorShapeB::kColumn>,
ThreadMapB::kThreads,
layout::PitchLinearShape<WarpArrangement::kContiguous * kColumnsInterleaved,
WarpArrangement::kStrided / kColumnsInterleaved>,
MmaCore::kAccessSizeInBits / sizeof_bits<ElementB>::value>;
public:
// Define iterators over tiles from the B operand
using AccessTypeB = cutlass::Array<ElementB, kAlignmentB>;
using IteratorB = cutlass::transform::threadblock::PredicatedTileAccessIterator<
IteratorShapeB, ElementB, layout::ColumnMajor, 0, InterleavedThreadMapB,
AccessTypeB>;
private:
// Define iterators over tiles from extra quant params for B operand
using IteratorSuperScale = typename DefaultQuantParamsIterators<
ThreadblockShape, ElementSuperScale, -1>::Iterator;
using SmemIteratorSuperScale = typename DefaultQuantParamsIterators<
ThreadblockShape, ElementSuperScale, -1>::SmemIterator;
using IteratorLocalScale = typename DefaultQuantParamsIterators<
ThreadblockShape, ElementLocalScale, kGroupSize>::Iterator;
using SmemIteratorLocalScale = typename DefaultQuantParamsIterators<
ThreadblockShape, ElementLocalScale, kGroupSize>::SmemIterator;
using IteratorCodeScaleZp = typename DefaultQuantParamsIterators<
ThreadblockShape, ElementCodeScaleZp, -1>::Iterator;
using SmemIteratorCodeScaleZp = typename DefaultQuantParamsIterators<
ThreadblockShape, ElementCodeScaleZp, -1>::Iterator;
public:
using QuantParamsAccessor = Wint2ParamsAccessor<
ElementA, ThreadblockShape, IteratorSuperScale, SmemIteratorSuperScale,
IteratorLocalScale, SmemIteratorLocalScale,
IteratorCodeScaleZp, SmemIteratorCodeScaleZp, kStages, kGroupSize>;
// Define the threadblock-scoped multistage matrix multiply
using ThreadblockMma = cutlass::gemm::threadblock::Wint2xMmaMultistage<
typename MmaCore::Shape,
IteratorA, typename MmaCore::SmemIteratorA, MmaCore::kCacheOpA,
IteratorB, typename MmaCore::SmemIteratorB, MmaCore::kCacheOpB,
ElementAccumulator, layout::RowMajor, typename MmaCore::MmaPolicy,
kStages, QuantParamsAccessor, SharedMemoryClear>;
};
} // namespace threadblock
} // namespace gemm
} // namespace cutlass

View File

@@ -63,8 +63,8 @@ template <
typename Policy_,
/// Number of stages,
int Stages,
/// Used for partial specialization
typename Enable = bool>
/// Size of extra quantized params
typename QuantParamsShape>
class Wint2xMmaBase {
public:
///< Size of the Gemm problem - concept: gemm::GemmShape<>
@@ -93,6 +93,14 @@ public:
static int const kWarpGemmIterations =
(WarpGemm::kK / Operator::Policy::MmaShape::kK);
/// Number of warp-level GEMM operations per load for B
static constexpr int kWarpGemmIterationsPerLoadForB =
Operator::IteratorB::InstructionShape::kRow / Operator::InstructionShape::kK;
static_assert(!(kWarpGemmIterations % kWarpGemmIterationsPerLoadForB), "");
static constexpr int kWarpLoadIterationsForB =
kWarpGemmIterations / kWarpGemmIterationsPerLoadForB;
/// Number of stages
static int const kStages = Stages;
@@ -104,8 +112,6 @@ public:
using TensorRefB =
TensorRef<typename Operator::ElementB, typename Operator::LayoutB>;
// using TensorRefZippedB = TensorRef<uint8_t, typename Operator::LayoutB>;
static_assert(kWarpGemmIterations > 1,
"The pipelined structure requires at least two warp-level "
"GEMM operations.");
@@ -130,20 +136,11 @@ public:
Shape::kK * kStages + Policy::SmemPaddingA::kColumn>;
/// Shape of the B matrix operand in shared memory
using ShapeB = MatrixShape<Shape::kK + Policy::SmemPaddingB::kRow,
using ShapeB = MatrixShape<Shape::kK * kStages + Policy::SmemPaddingB::kRow,
Shape::kN + Policy::SmemPaddingB::kColumn>;
// w uint8; local_scale uint8;
constexpr static int kZippedRowsPerStages =
Shape::kK / 4 + (Shape::kK + 127) / 128;
// code_scale float; code_zp float; super_scale ElementB
constexpr static int kColumnWiseParamsRows = 2 * sizeof(float) +
sizeof_bits<typename Operator::ElementB>::value / 8;
using ZippedShapeB = MatrixShape<kColumnWiseParamsRows + kZippedRowsPerStages * kStages, Shape::kN>;
using NopaddingShapeB = MatrixShape<Shape::kK, Shape::kN>;
/// Shape of all quant params in shared memory
using QuantParamsShapeB = QuantParamsShape;
public:
//
@@ -156,12 +153,8 @@ public:
/// Buffer for B operand
AlignedBuffer<typename Operator::ElementB, ShapeB::kCount> operand_B;
/// Buffer for quanted B operand
AlignedBuffer<uint8_t, ZippedShapeB::kCount> operand_zipped_B;
/// Buffer for unzip B operand
AlignedBuffer<typename Operator::ElementB, NopaddingShapeB::kCount>
operand_unzip_B;
/// Buffer for extra quant params of B operand
AlignedBuffer<uint8_t, QuantParamsShapeB::kCount> operand_quant_params_B;
public:
//
@@ -191,14 +184,6 @@ public:
TensorRefB operand_B_ref() {
return TensorRefB{operand_B.data(), LayoutB()};
}
CUTLASS_HOST_DEVICE
uint8_t *operand_zipped_B_ptr() { return operand_zipped_B.data(); }
CUTLASS_HOST_DEVICE
typename Operator::ElementB *operand_unzip_B_ptr() {
return operand_unzip_B.data();
}
};
protected:
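A rough shared-memory footprint calculation for the reshaped B buffer above, under assumed shapes (64x64x64 threadblock tile, 4 stages, padding ignored); only the fact that ShapeB now spans (kK * kStages) rows by kN columns comes from the diff.

constexpr int kTbK           = 64;   // assumed Shape::kK
constexpr int kTbN           = 64;   // assumed Shape::kN
constexpr int kNumStages     = 4;    // assumed pipeline depth
constexpr int kBitsB         = 2;    // uint2b_t operand B kept quantized in shared memory
constexpr int kShapeBCount   = (kTbK * kNumStages) * kTbN;    // 16384 elements
constexpr int kOperandBBytes = kShapeBCount * kBitsB / 8;     // 4096 bytes of shared memory
static_assert(kOperandBBytes == 4096, "64 * 4 * 64 two-bit elements pack into 4 KiB");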

View File

@@ -45,7 +45,8 @@
#include "cutlass_extensions/arch/memory_copy_sm80.h"
#include "cutlass_extensions/gemm/threadblock/wint2x_mma_base.h"
#include "cutlass_extensions/gemm/threadblock/wint2x_tile_dequanter.h"
#include "cutlass_extensions/gemm/threadblock/wint2x_params_accessor.h"
#include "cutlass_extensions/gemm/warp/mma_tensorop_wint2x_dequantizer.h"
/////////////////////////////////////////////////////////////////////////////////////////////////
@@ -86,15 +87,15 @@ template <
typename Policy_,
/// Number of stages,
int Stages,
/// Accessor for extra quantized params
typename QuantParamsAccessor_,
/// Use zfill or predicate for out-of-bound cp.async
SharedMemoryClearOption SharedMemoryClear = SharedMemoryClearOption::kNone,
/// Used for partial specialization
typename Enable = bool>
SharedMemoryClearOption SharedMemoryClear = SharedMemoryClearOption::kNone>
class Wint2xMmaMultistage :
public Wint2xMmaBase<Shape_, Policy_, Stages> {
public Wint2xMmaBase<Shape_, Policy_, Stages, typename QuantParamsAccessor_::QuantParamsShape> {
public:
///< Base class
using Base = Wint2xMmaBase<Shape_, Policy_, Stages>;
using Base = Wint2xMmaBase<Shape_, Policy_, Stages, typename QuantParamsAccessor_::QuantParamsShape>;
///< Size of the Gemm problem - concept: gemm::GemmShape<>
using Shape = Shape_;
///< Iterates over tiles of A operand in global memory
@@ -107,8 +108,11 @@ public:
using LayoutC = LayoutC_;
///< Policy describing tuning details
using Policy = Policy_;
/// Accessor for extra quantized params
using QuantParamsAccessor = QuantParamsAccessor_;
using QuantArguments = typename QuantParamsAccessor::Arguments;
using ZippedShapeB = typename Base::SharedStorage::ZippedShapeB;
static constexpr int kInterleave = IteratorB::Shape::kRow / Shape::kK;
using SmemIteratorA = SmemIteratorA_;
using SmemIteratorB = SmemIteratorB_;
@@ -129,6 +133,18 @@ public:
/// Minimum architecture is Sm80 to support cp.async
using ArchTag = arch::Sm80;
//using LayoutScale = typename QuantParamsAccessor::IteratorSuperScale::Layout;
using LayoutScale = layout::RowMajor;
using WarpTransformedFragmentB = typename Operator::TransformedFragmentB;
using WarpDequantizer =
warp::MmaTensorOpWin2xDequantizer<Operator,
typename Base::WarpGemm,
Operand::kB,
typename WarpTransformedFragmentB::Element,
LayoutScale,
QuantParamsAccessor::kGroupSize>;
static_assert(sizeof(WarpDequantizer) > 0, "WarpDequantizer template instantiation failed");
/// Complex transform on A operand
static ComplexTransform const kTransformA = Operator::kTransformA;
@@ -174,18 +190,37 @@ public:
using WarpTransformedFragmentA = typename Operator::TransformedFragmentA;
using WarpTransformedFragmentB = typename Operator::TransformedFragmentB;
using FragmentSuperScale = typename WarpDequantizer::FragmentSuperScale;
using FragmentCodeScaleZp = typename WarpDequantizer::FragmentCodeScaleZp;
using FragmentLocalScale = typename WarpDequantizer::FragmentLocalScale;
/// Temporary accumulator to facilitate staged-accumulation
FragmentC tmp_accum_;
/// Pair of A fragments used to overlap shared memory loads and math instructions
WarpLoadedFragmentA warp_loaded_frag_A_[2];
WarpTransformedFragmentA warp_transformed_frag_A_[2];
WarpTransformedFragmentA warp_frag_A_[2];
/// Pair of B fragments used to overlap shared memory loads and math instructions
WarpLoadedFragmentB warp_loaded_frag_B_[2];
WarpTransformedFragmentB warp_transformed_frag_B_[2];
WarpLoadedFragmentB warp_loaded_frag_B_;
WarpTransformedFragmentB warp_frag_B_[2];
/// channel-wise quant params
FragmentCodeScaleZp warp_frag_code_scale_;
FragmentCodeScaleZp warp_frag_code_zp_;
FragmentSuperScale warp_frag_super_scale_;
/// group-wise quant params
FragmentLocalScale warp_frag_local_scale_;
};
using ElementA = typename IteratorA::Element;
using ElementB = typename IteratorB::Element;
using LayoutDetailsForB = kernel::LayoutDetailsB<ElementA, ElementB, ArchTag>;
static constexpr bool IsTileInterleaveLayout =
layout::IsColumnMajorTileInterleave<typename LayoutDetailsForB::Layout>::value;
static_assert(!IsTileInterleaveLayout || (IsTileInterleaveLayout && (Shape::kK == LayoutDetailsForB::ThreadblockK)),
"Layout K must match threadblockK");
private:
@@ -202,17 +237,18 @@ public:
/// Iterator to write threadblock-scoped tile of B operand to shared memory
SmemIteratorB smem_iterator_B_;
/// Accessor for extra quant params for B
QuantParamsAccessor quant_params_accessor_B_;
// Wint2 unzip operator
WarpDequantizer warp_dequantizer_;
/// Shared memory write stage index
int smem_write_stage_idx_;
/// Shared memory read stage index
int smem_read_stage_idx_;
uint8_t* column_wise_smem_ptr_B_;
uint8_t* smem_zipped_ptr_B_;
int smem_zipped_bytes_per_stage_B_;
public:
/// Construct from tensor references
@@ -226,10 +262,15 @@ public:
int warp_idx,
///< ID of each thread within a warp
int lane_idx
):
Base(shared_storage, thread_idx, warp_idx, lane_idx),
) : Base(shared_storage, thread_idx, warp_idx, lane_idx),
smem_iterator_A_(shared_storage.operand_A_ref(), thread_idx),
smem_iterator_B_(shared_storage.operand_B_ref(), thread_idx),
quant_params_accessor_B_(shared_storage.operand_quant_params_B.data(), thread_idx, warp_idx, lane_idx),
warp_dequantizer_(quant_params_accessor_B_.super_scale_ref(),
quant_params_accessor_B_.local_scale_ref(),
quant_params_accessor_B_.code_scale_ref(),
quant_params_accessor_B_.code_zp_ref(),
(warp_idx % (Base::WarpCount::kM * Base::WarpCount::kN)) / Base::WarpCount::kM, lane_idx),
smem_write_stage_idx_(0),
smem_read_stage_idx_(0)
{
@@ -250,11 +291,6 @@ public:
{warp_idx_m, Base::kWarpGemmIterations * warp_idx_k});
this->warp_tile_iterator_B_.add_tile_offset(
{Base::kWarpGemmIterations * warp_idx_k, warp_idx_n});
column_wise_smem_ptr_B_ = shared_storage.operand_zipped_B_ptr();
smem_zipped_ptr_B_ = column_wise_smem_ptr_B_ + Base::SharedStorage::kColumnWiseParamsRows * ZippedShapeB::kColumn;
smem_zipped_bytes_per_stage_B_ = Base::SharedStorage::kZippedRowsPerStages * ZippedShapeB::kColumn;
}
/// Advance shared memory read-iterators to the next stage
@@ -266,28 +302,22 @@ public:
if (smem_read_stage_idx_ == Base::kStages) {
// Wrap back around to the 'start' of the circular buffer in shared memory
this->warp_tile_iterator_A_.add_tile_offset({0, -Base::kStages * Policy::kPartitionsK * Base::kWarpGemmIterations});
// this->warp_tile_iterator_B_.add_tile_offset({-Base::kStages * Policy::kPartitionsK * Base::kWarpGemmIterations, 0});
this->warp_tile_iterator_B_.add_tile_offset({-Base::kStages * Policy::kPartitionsK * Base::kWarpLoadIterationsForB, 0});
smem_read_stage_idx_ = 0;
}
this->warp_tile_iterator_B_.add_tile_offset({-Policy::kPartitionsK * Base::kWarpGemmIterations, 0});
}
/// Advance global memory read-iterators and shared memory write-iterators to the stage
template <typename TileDequanterB>
CUTLASS_DEVICE
void advance_smem_write_stage(
IteratorA &iterator_A,
IteratorB &iterator_B,
TileDequanterB &tile_dequanter_B)
void advance_smem_write_stage(IteratorA &iterator_A, IteratorB &iterator_B)
{
// Advance global iterators
iterator_A.add_tile_offset({0, 1});
//iterator_B.add_tile_offset({1, 0});
tile_dequanter_B.AddTileOffset({1, 0});
iterator_B.add_tile_offset({1, 0});
// Advance shared iterators
smem_iterator_A_.add_tile_offset({0, 1});
//smem_iterator_B_.add_tile_offset({1, 0});
smem_iterator_B_.add_tile_offset({1, 0});
// Increment shared memory write stage index
++smem_write_stage_idx_;
@@ -295,7 +325,7 @@ public:
if (smem_write_stage_idx_ == Base::kStages) {
// Wrap back around to the 'start' of the circular buffer in shared memory
smem_iterator_A_.add_tile_offset({0, -Base::kStages});
//smem_iterator_B_.add_tile_offset({-Base::kStages, 0});
smem_iterator_B_.add_tile_offset({-Base::kStages, 0});
smem_write_stage_idx_ = 0;
}
}
@@ -338,9 +368,14 @@ public:
}
}
template <bool GlobalToSharedB>
CUTLASS_DEVICE
void copy_tiles_and_advance_B(IteratorB &iterator_B, int group_start_B = 0) {
if constexpr (SharedMemoryClear == SharedMemoryClearOption::kZfill) {
if (threadIdx.x >= IteratorB::ThreadMap::kThreads) {
return;
}
}
iterator_B.set_iteration_index(group_start_B *
IteratorB::kAccessesPerVector);
this->smem_iterator_B_.set_iteration_index(group_start_B);
@@ -360,13 +395,14 @@ public:
CUTLASS_PRAGMA_UNROLL
for (int v = 0; v < IteratorB::kAccessesPerVector; ++v) {
auto gmem_ptr = iterator_B.get();
bool is_valid = (threadIdx.x < IteratorB::ThreadMap::kThreads) ? iterator_B.valid() : false;
if (SharedMemoryClear == SharedMemoryClearOption::kZfill) {
cutlass::arch::copy_zfill<kSrcBytes, kCacheOpB, GlobalToSharedB>(
dst_ptr + v, gmem_ptr, iterator_B.valid());
cutlass::arch::cp_async_zfill<kSrcBytes, kCacheOpB>(
dst_ptr + v, gmem_ptr, is_valid);
} else {
cutlass::arch::copy<kSrcBytes, kCacheOpB, GlobalToSharedB>(
dst_ptr + v, gmem_ptr, iterator_B.valid());
cutlass::arch::cp_async<kSrcBytes, kCacheOpB>(
dst_ptr + v, gmem_ptr, is_valid);
}
++iterator_B;
@@ -375,7 +411,6 @@ public:
++this->smem_iterator_B_;
}
}
__syncthreads();
}
CUTLASS_DEVICE
@@ -399,8 +434,6 @@ public:
IteratorA::ThreadMap::kElementsPerAccess /
IteratorA::kAccessesPerVector / 8;
int src_bytes = (iterator_A.valid() ? kSrcBytes : 0);
cutlass::arch::cp_async_zfill<kSrcBytes, kCacheOpA>(
dst_ptr + v, iterator_A.get(), iterator_A.valid());
@@ -411,9 +444,12 @@ public:
}
}
template <bool GlobalToSharedB, bool InitStage>
CUTLASS_DEVICE
void copy_tiles_and_advance_per_stage_B(IteratorB &iterator_B) {
if (threadIdx.x >= IteratorB::ThreadMap::kThreads) {
return;
}
iterator_B.set_iteration_index(0);
this->smem_iterator_B_.set_iteration_index(0);
@@ -433,35 +469,23 @@ public:
IteratorB::ThreadMap::kElementsPerAccess /
IteratorB::kAccessesPerVector / 8;
if (InitStage) {
cutlass::arch::copy_zfill<kSrcBytes, kCacheOpB, GlobalToSharedB>(
dst_ptr + v, iterator_B.get(), iterator_B.valid());
} else {
if (SharedMemoryClear == SharedMemoryClearOption::kZfill) {
cutlass::arch::copy_zfill<kSrcBytes, kCacheOpB, GlobalToSharedB>(
dst_ptr + v, gmem_ptr, iterator_B.valid());
} else {
cutlass::arch::copy<kSrcBytes, kCacheOpB, GlobalToSharedB>(
dst_ptr + v, gmem_ptr, iterator_B.valid());
}
}
cutlass::arch::cp_async_zfill<kSrcBytes, kCacheOpB>(
dst_ptr + v, iterator_B.get(), iterator_B.valid());
++iterator_B;
}
++this->smem_iterator_B_;
}
__syncthreads();
}
/// GEMM prologue. Bootstrap the global->shared memory pipeline by fetching
/// the global fragments needed by the first kStages-1 threadblock mainloop iterations
template <typename TileDequanterB>
CUTLASS_DEVICE
void prologue(
IteratorA &iterator_A, ///< [in|out] iterator over A operand in global memory
IteratorB &iterator_B, ///< [in|out] iterator over B operand in global memory
TileDequanterB &tile_dequanter_B,
QuantArguments &mma_quant_args, ///< iterators for extra quant params for B
int &gemm_k_iterations) ///< [in|out] number of threadblock mainloop iterations remaining
{
// Issue several complete stages
@@ -476,11 +500,18 @@ public:
copy_tiles_and_advance_per_stage_A(iterator_A);
// Async copy zipped B to shared memory.
tile_dequanter_B.Load(smem_zipped_ptr_B_ + (stage % Base::kStages) * smem_zipped_bytes_per_stage_B_,
column_wise_smem_ptr_B_, stage);
copy_tiles_and_advance_per_stage_B(iterator_B);
// Async copy other quantized params to shared memory, local_scale, code_scale, code_zp, super_scale.
if (stage == 0) {
quant_params_accessor_B_.copy_tiles_and_advance_per_stage<true>(mma_quant_args, stage);
} else {
quant_params_accessor_B_.copy_tiles_and_advance_per_stage<false>(mma_quant_args, stage);
}
// Move to the next write stage
advance_smem_write_stage(iterator_A, iterator_B, tile_dequanter_B);
advance_smem_write_stage(iterator_A, iterator_B);
quant_params_accessor_B_.advance_smem_write_stage(mma_quant_args);
// Defines the boundary of a stage of cp.async.
cutlass::arch::cp_async_fence();
@@ -510,6 +541,10 @@ public:
++last_smem_iterator_A;
}
if (threadIdx.x >= IteratorB::ThreadMap::kThreads) {
return;
}
/// Iterator to write threadblock-scoped tile of B operand to shared memory
SmemIteratorB last_smem_iterator_B(this->smem_iterator_B_);
typename IteratorB::AccessType zero_B;
@@ -542,57 +577,57 @@ public:
}
/// Perform a threadblock mainloop iteration of matrix multiply-accumulate
template <typename TileDequanterB>
CUTLASS_DEVICE
void mac_loop_iter(
PipeState &pipe_state, ///< [in|out] loop-carried pipeline state
FragmentC &accum, ///< [in|out] destination accumulator tile
IteratorA &iterator_A, ///< [in|out] iterator over A operand in global memory
IteratorB &iterator_B, ///< [in|out] iterator over B operand in global memory
TileDequanterB &tile_dequanter_B, ///< [in|out] tile dequantizer for B operand
int &gemm_k_iterations, ///< [in|out] number of threadblock mainloop iterations remaining
QuantArguments &mma_quant_args, ///< iterators for extra quant params for B
int &gemm_k_iterations, ///< [in|out] number of threadblock mainloop iterations remaining
int stage)
{
const int mma_stage = stage - Base::kStages + 1;
// Unroll the warp-level MMA tiles of a threadblock's mainloop iteration
CUTLASS_PRAGMA_UNROLL
for (int warp_mma_k = 0; warp_mma_k < Base::kWarpGemmIterations; ++warp_mma_k) {
// CUTLASS_TRACE_DEVICE(" [MMa] stage=%d, warp_mma_k=%d", stage, warp_mma_k);
int warp_k_compute_offset_B = warp_mma_k % Base::kWarpGemmIterationsPerLoadForB;
if (warp_k_compute_offset_B == Base::kWarpGemmIterationsPerLoadForB - 1) {
// Load the next warp-tile's B fragment from shared memory
this->warp_tile_iterator_B_.set_kgroup_index(((warp_mma_k + 1) % Base::kWarpGemmIterations) / Base::kWarpLoadIterationsForB);
this->warp_tile_iterator_B_.load(pipe_state.warp_loaded_frag_B_);
++this->warp_tile_iterator_B_;
}
// load next-tile of group-wise local_scale from shared memory
if (warp_mma_k == Base::kWarpGemmIterations - 1) {
warp_dequantizer_.load(pipe_state.warp_frag_local_scale_);
}
// Load the next warp-tile's A fragment from shared memory
this->warp_tile_iterator_A_.set_kgroup_index((warp_mma_k + 1) % Base::kWarpGemmIterations);
this->warp_tile_iterator_A_.load(pipe_state.warp_loaded_frag_A_[(warp_mma_k + 1) % 2]);
this->warp_tile_iterator_A_.load(pipe_state.warp_frag_A_[(warp_mma_k + 1) % 2]);
++this->warp_tile_iterator_A_;
if (warp_mma_k + 1 == Base::kWarpGemmIterations) {
// Unpack and dequant the first stage of B.
int unpack_stage = stage - Base::kStages + 2;
tile_dequanter_B.UnpackAndDequant(smem_zipped_ptr_B_ + (unpack_stage % Base::kStages) * smem_zipped_bytes_per_stage_B_,
column_wise_smem_ptr_B_, unpack_stage);
// Copy dequantized data to the shared memory used by the mma core.
copy_tiles_and_advance_per_stage_B<false, false>(iterator_B);
}
// Load the next warp-tile's B fragment from shared memory
this->warp_tile_iterator_B_.set_kgroup_index((warp_mma_k + 1) % Base::kWarpGemmIterations);
this->warp_tile_iterator_B_.load(pipe_state.warp_loaded_frag_B_[(warp_mma_k + 1) % 2]);
++this->warp_tile_iterator_B_;
// Except for the first warp-tile, all warp-tiles convert their incoming shared memory fragments as necessary
if (warp_mma_k > 0) {
warp_mma_.transform(
pipe_state.warp_transformed_frag_A_[warp_mma_k % 2],
pipe_state.warp_transformed_frag_B_[warp_mma_k % 2],
pipe_state.warp_loaded_frag_A_[warp_mma_k % 2],
pipe_state.warp_loaded_frag_B_[warp_mma_k % 2]);
}
// Dequantize the next warp-tile of B
warp_dequantizer_.dequantize(pipe_state.warp_frag_local_scale_,
pipe_state.warp_frag_code_scale_,
pipe_state.warp_frag_code_zp_,
pipe_state.warp_frag_super_scale_,
pipe_state.warp_loaded_frag_B_,
pipe_state.warp_frag_B_[(warp_mma_k + 1) % 2],
((warp_mma_k == Base::kWarpGemmIterations - 1) ? (mma_stage + 1) : mma_stage) * Shape::kK,
(warp_mma_k + 1) % Base::kWarpGemmIterationsPerLoadForB);
// Execute the current warp-tile of MMA operations
if (Detail::kStagedAccumulation) {
if constexpr (Detail::kStagedAccumulation) {
warp_mma_(
pipe_state.tmp_accum_,
pipe_state.warp_transformed_frag_A_[warp_mma_k % 2],
pipe_state.warp_transformed_frag_B_[warp_mma_k % 2],
pipe_state.warp_frag_A_[warp_mma_k % 2],
pipe_state.warp_frag_B_[warp_mma_k % 2],
pipe_state.tmp_accum_
);
@@ -604,22 +639,22 @@ public:
} else {
warp_mma_(
accum,
pipe_state.warp_transformed_frag_A_[warp_mma_k % 2],
pipe_state.warp_transformed_frag_B_[warp_mma_k % 2],
accum
);
pipe_state.warp_frag_A_[warp_mma_k % 2],
pipe_state.warp_frag_B_[warp_mma_k % 2],
accum);
}
// Except for the last warp-tile, all warp-tiles issue their share of
// global->shared fragment copies
if (warp_mma_k < Base::kWarpGemmIterations - 1) {
int group_start_iteration_A = warp_mma_k * Detail::kAccessesPerGroupA;
int group_start_iteration_B = warp_mma_k * Detail::kAccessesPerGroupB;
copy_tiles_and_advance_A(iterator_A, group_start_iteration_A);
copy_tiles_and_advance_B(iterator_B, group_start_iteration_B);
if (warp_mma_k == 0) {
tile_dequanter_B.Load(smem_zipped_ptr_B_ + (stage % Base::kStages) * smem_zipped_bytes_per_stage_B_,
column_wise_smem_ptr_B_, stage);
quant_params_accessor_B_.copy_tiles_and_advance_per_stage<false>(mma_quant_args, stage);
}
}
@@ -628,9 +663,15 @@ public:
// - moves to the next global fetch stage
if (warp_mma_k + 2 == Base::kWarpGemmIterations) {
// Performs the last warp-tile's share of global->shared fragment copies
int group_start_iteration_A = (warp_mma_k + 1) * Detail::kAccessesPerGroupA;
if constexpr (Detail::AsyncCopyIterationsPerStageA >= Base::kWarpGemmIterations) {
int group_start_iteration_A = (warp_mma_k + 1) * Detail::kAccessesPerGroupA;
copy_tiles_and_advance_A(iterator_A, group_start_iteration_A);
}
copy_tiles_and_advance_A(iterator_A, group_start_iteration_A);
if constexpr (Detail::AsyncCopyIterationsPerStageB >= Base::kWarpGemmIterations) {
int group_start_iteration_B = (warp_mma_k + 1) * Detail::kAccessesPerGroupB;
copy_tiles_and_advance_B(iterator_B, group_start_iteration_B);
}
// Inserts a memory fence between stages of cp.async instructions.
cutlass::arch::cp_async_fence();
@@ -639,69 +680,66 @@ public:
gmem_wait();
// Move to the next global fetch stage
advance_smem_write_stage(iterator_A, iterator_B, tile_dequanter_B);
advance_smem_write_stage(iterator_A, iterator_B);
quant_params_accessor_B_.advance_smem_write_stage(mma_quant_args);
advance_smem_read_stage();
int byte_offset = quant_params_accessor_B_.advance_smem_read_stage();
warp_dequantizer_.add_pointer_offset(byte_offset);
// Disable global fetching when done with global fetch iterations
--gemm_k_iterations;
iterator_A.clear_mask(gemm_k_iterations == 0);
iterator_B.clear_mask(gemm_k_iterations == (-Base::kStages + 1));
}
// The last warp-tile also converts the shared memory fragments used by
// the first warp-tile of the next iteration, if necessary (so we can
// immediately start issuing MMA instructions at the top of the loop)
if (warp_mma_k + 1 == Base::kWarpGemmIterations) {
warp_mma_.transform(
pipe_state.warp_transformed_frag_A_[(warp_mma_k + 1) % 2],
pipe_state.warp_transformed_frag_B_[(warp_mma_k + 1) % 2],
pipe_state.warp_loaded_frag_A_[(warp_mma_k + 1) % 2],
pipe_state.warp_loaded_frag_B_[(warp_mma_k + 1) % 2]);
iterator_B.clear_mask(gemm_k_iterations == 0);
quant_params_accessor_B_.clear_mask(mma_quant_args, gemm_k_iterations == 0);
}
}
}
/// Perform the specified number of threadblock mainloop iterations of matrix
/// multiply-accumulate. Assumes prologue has been initiated.
template <typename TileDequanterB>
CUTLASS_DEVICE
void gemm_iters(
int gemm_k_iterations, ///< number of threadblock mainloop iterations
FragmentC &accum, ///< [in|out] accumulator tile
IteratorA &iterator_A, ///< [in|out] iterator over A operand in global memory
IteratorB &iterator_B,
TileDequanterB &tile_dequanter_B) ///< [in|out] iterator over B operand in global memory
IteratorB &iterator_B, ///< [in|out] iterator over B operand in global memory
QuantArguments &mma_quant_args)
{
PipeState pipe_state;
// Unpack and dequant the first stage of B.
tile_dequanter_B.UnpackAndDequant(smem_zipped_ptr_B_, column_wise_smem_ptr_B_, 0);
// Disable global fetching if done with global fetch iterations
iterator_A.clear_mask(gemm_k_iterations == 0);
iterator_B.clear_mask(gemm_k_iterations == (-Base::kStages + 1));
// Load first warp-tile's A fragment from shared memory
this->warp_tile_iterator_A_.set_kgroup_index(0);
this->warp_tile_iterator_A_.load(pipe_state.warp_loaded_frag_A_[0]);
++this->warp_tile_iterator_A_;
// Copy dequantized data to the shared memory used by the mma core.
copy_tiles_and_advance_per_stage_B<false, true>(iterator_B);
iterator_B.clear_mask(gemm_k_iterations == 0);
quant_params_accessor_B_.clear_mask(mma_quant_args, gemm_k_iterations == 0);
// Load first warp-tile's B fragment from shared memory
this->warp_tile_iterator_B_.set_kgroup_index(0);
this->warp_tile_iterator_B_.load(pipe_state.warp_loaded_frag_B_[0]);
this->warp_tile_iterator_B_.load(pipe_state.warp_loaded_frag_B_);
++this->warp_tile_iterator_B_;
// Transform, if necessary, the first warp-tile's shared memory fragments
warp_mma_.transform(
pipe_state.warp_transformed_frag_A_[0],
pipe_state.warp_transformed_frag_B_[0],
pipe_state.warp_loaded_frag_A_[0],
pipe_state.warp_loaded_frag_B_[0]);
warp_dequantizer_.load(pipe_state.warp_frag_code_scale_,
pipe_state.warp_frag_code_zp_,
pipe_state.warp_frag_super_scale_);
if (Detail::kStagedAccumulation) {
warp_dequantizer_.load(pipe_state.warp_frag_local_scale_);
// Load first warp-tile's A fragment from shared memory
this->warp_tile_iterator_A_.set_kgroup_index(0);
this->warp_tile_iterator_A_.load(pipe_state.warp_frag_A_[0]);
++this->warp_tile_iterator_A_;
// Dequantize B into registers
warp_dequantizer_.dequantize(pipe_state.warp_frag_local_scale_,
pipe_state.warp_frag_code_scale_,
pipe_state.warp_frag_code_zp_,
pipe_state.warp_frag_super_scale_,
pipe_state.warp_loaded_frag_B_,
pipe_state.warp_frag_B_[0],
0,
0);
if constexpr (Detail::kStagedAccumulation) {
pipe_state.tmp_accum_.clear();
}
@@ -715,13 +753,13 @@ public:
accum,
iterator_A,
iterator_B,
tile_dequanter_B,
mma_quant_args,
gemm_k_iterations,
stage);
stage += 1;
}
if (Detail::kStagedAccumulation) {
if constexpr (Detail::kStagedAccumulation) {
plus<FragmentC> plus_accum;
accum = plus_accum(accum, pipe_state.tmp_accum_);
}
@@ -761,14 +799,12 @@ public:
else
{
this->warp_tile_iterator_A_.add_tile_offset({0, ((Base::kStages - 2) * kStageIters)});
//this->warp_tile_iterator_B_.add_tile_offset({((Base::kStages - 2) * kStageIters), 0});
this->warp_tile_iterator_B_.add_tile_offset({(-2 * kStageIters), 0});
this->warp_tile_iterator_B_.add_tile_offset({((Base::kStages - 2) * kStageIters), 0});
}
smem_read_stage_idx_ = smem_write_stage_idx_;
}
/// Perform a threadblock-scoped matrix multiply-accumulate, pre-load B to shared memory.
template <typename TileDequanterB>
CUTLASS_DEVICE
void operator()(
///< problem size of GEMM
@@ -779,13 +815,13 @@ public:
IteratorA iterator_A,
///< iterator over B operand in global memory
IteratorB iterator_B,
///< pre-load and dequantize B to shared memory
TileDequanterB tile_dequanter_B,
///< iterators for extra quant params for B
QuantArguments mma_quant_args,
///< initial value of accumulator
FragmentC const &src_accum) {
// Prologue (start fetching iterations of global fragments into shared memory)
prologue(iterator_A, iterator_B, tile_dequanter_B, gemm_k_iterations);
prologue(iterator_A, iterator_B, mma_quant_args, gemm_k_iterations);
// Wait until we have at least one completed global fetch stage
gmem_wait();
@@ -794,7 +830,7 @@ public:
accum = src_accum;
// Perform the MAC-iterations
gemm_iters(gemm_k_iterations, accum, iterator_A, iterator_B, tile_dequanter_B);
gemm_iters(gemm_k_iterations, accum, iterator_A, iterator_B, mma_quant_args);
}
};

View File

@@ -0,0 +1,315 @@
// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "cutlass/arch/memory_sm80.h"
#include "cutlass/cutlass.h"
#include "cutlass/gemm/gemm.h"
#include "cutlass/matrix_shape.h"
#include "cutlass/trace.h"
namespace cutlass {
namespace gemm {
namespace threadblock {
template <
/// Original data type
typename T,
/// Size of the Gemm problem - concept: gemm::GemmShape<>
typename Shape_,
/// Iterators over super scales in global memory
typename IteratorSuperScale_,
/// Iterators over super scales in shared memory
typename SmemIteratorSuperScale_,
/// Iterators over local scales in global memory
typename IteratorLocalScale_,
/// Iterators over local scales in shared memory
typename SmemIteratorLocalScale_,
/// Iterators over code scales and zps in global memory
typename IteratorCodeScaleZp_,
/// Iterators over code scales and zps in shared memory
typename SmemIteratorCodeScaleZp_,
/// Number of stages,
int Stages_,
/// Group size for quantization
int GroupSize_>
class Wint2ParamsAccessor {
public:
static_assert(platform::is_same<T, half_t>::value || platform::is_same<T, bfloat16_t>::value,
"T must be fp16 or bf16");
using ElementType = T;
using Shape = Shape_;
using IteratorSuperScale = IteratorSuperScale_;
using SmemIteratorSuperScale = SmemIteratorSuperScale_;
using IteratorLocalScale = IteratorLocalScale_;
using SmemIteratorLocalScale = SmemIteratorLocalScale_;
using IteratorCodeScaleZp = IteratorCodeScaleZp_;
using SmemIteratorCodeScaleZp = SmemIteratorCodeScaleZp_;
constexpr static int kStages = Stages_;
constexpr static int kGroupSize = GroupSize_;
using ElementSuperScale = typename IteratorSuperScale::Element;
using LayoutSuperScale = typename IteratorSuperScale::Layout;
/// local_scale uint4 and group-wise
using ElementLocalScale = typename IteratorLocalScale::Element;
using LayoutLocalScale = typename IteratorLocalScale::Layout;
static_assert(platform::is_same<ElementLocalScale, uint4b_t>::value,
"local_scale's type must be uint4b_t.");
using ElementCodeScaleZp = typename IteratorCodeScaleZp::Element;
using LayoutCodeScaleZp = typename IteratorCodeScaleZp::Layout;
/// 2 uint4b_t values are stored in a single uint8_t
constexpr static int kStagesPerLocalScaleLoad = 2 * kGroupSize / Shape::kK;
constexpr static int kLocalScaleRows =
IteratorLocalScale::Shape::kRow * IteratorLocalScale::Shape::kColumn * sizeof_bits<ElementLocalScale>::value / 8 / Shape::kN;
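// Worked example (hypothetical config): with kGroupSize = 64 and a threadblock tile of Shape::kK = 64,
// kStagesPerLocalScaleLoad = 2 * 64 / 64 = 2, i.e. one packed local_scale byte (two uint4b_t nibbles)
// covers two consecutive K-stages of the mainloop.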
using SmemElement = uint8_t;
constexpr static int kSmemRows =
kLocalScaleRows * kStages + sizeof(ElementSuperScale) + sizeof(ElementCodeScaleZp) * 2;
constexpr static int kSmemColumns = Shape::kN;
using QuantParamsShape = MatrixShape<kSmemRows, kSmemColumns>;
constexpr static int kSuperScaleSmemOffset = 0;
constexpr static int kCodeScaleSmemOffset = kSmemColumns * sizeof(ElementSuperScale);
constexpr static int kCodeZpSmemOffset = kCodeScaleSmemOffset + kSmemColumns * sizeof(ElementCodeScaleZp);
constexpr static int kLocalScaleSmemOffset = kCodeZpSmemOffset + kSmemColumns * sizeof(ElementCodeScaleZp);
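// Resulting shared-memory layout, derived from the offsets above:
// [super_scale: kSmemColumns elements] [code_scale: kSmemColumns] [code_zp: kSmemColumns]
// [local_scale: circular buffer of kStages stages, kLocalScaleRows x kSmemColumns bytes each]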
/// TensorRef type for loading element from a tensor
using SuperTensorRef = cutlass::TensorRef<ElementSuperScale, LayoutSuperScale>;
using LocalTensorRef = cutlass::TensorRef<ElementLocalScale, LayoutLocalScale>;
using CodeTensorRef = cutlass::TensorRef<ElementCodeScaleZp, LayoutCodeScaleZp>;
struct Arguments {
IteratorSuperScale iterator_super_scale;
IteratorLocalScale iterator_local_scale;
IteratorCodeScaleZp iterator_code_scale;
IteratorCodeScaleZp iterator_code_zp;
int local_scale_pointer_offset;
CUTLASS_DEVICE
Arguments(IteratorSuperScale iterator_super_scale,
IteratorLocalScale iterator_local_scale,
IteratorCodeScaleZp iterator_code_scale,
IteratorCodeScaleZp iterator_code_zp,
int local_scale_pointer_offset)
: iterator_super_scale(iterator_super_scale),
iterator_local_scale(iterator_local_scale),
iterator_code_scale(iterator_code_scale),
iterator_code_zp(iterator_code_zp),
local_scale_pointer_offset(local_scale_pointer_offset) {}
};
private:
//
// Data members
//
/// Begin address of shared memory
uint8_t* smem_pointer_;
/// Iterator to write threadblock-scoped tile of super scale operand to shared memory
SmemIteratorSuperScale smem_iterator_super_scale_;
/// Iterator to write threadblock-scoped tile of local scale operand to shared memory
SmemIteratorLocalScale smem_iterator_local_scale_;
/// Iterator to write threadblock-scoped tile of code scale operand to shared memory
SmemIteratorCodeScaleZp smem_iterator_code_scale_;
/// Iterator to write threadblock-scoped tile of code zp operand to shared memory
SmemIteratorCodeScaleZp smem_iterator_code_zp_;
/// Shared memory write stage index
int smem_write_stage_idx_;
/// Shared memory read stage index
int smem_read_stage_idx_;
CUTLASS_DEVICE
ElementSuperScale* get_super_scale_smem_ptr() {
return reinterpret_cast<ElementSuperScale*>(smem_pointer_ + kSuperScaleSmemOffset);
}
CUTLASS_DEVICE
ElementLocalScale* get_local_scale_smem_ptr() {
return reinterpret_cast<ElementLocalScale*>(smem_pointer_ + kLocalScaleSmemOffset);
}
CUTLASS_DEVICE
ElementCodeScaleZp* get_code_scale_smem_ptr() {
return reinterpret_cast<ElementCodeScaleZp*>(smem_pointer_ + kCodeScaleSmemOffset);
}
CUTLASS_DEVICE
ElementCodeScaleZp* get_code_zp_smem_ptr() {
return reinterpret_cast<ElementCodeScaleZp*>(smem_pointer_ + kCodeZpSmemOffset);
}
public:
/// Construct from tensor references
CUTLASS_DEVICE
Wint2ParamsAccessor(
///< pointer to shared memory
uint8_t* smem_pointer,
///< ID within the threadblock
int thread_idx,
///< ID of warp
int warp_idx,
///< ID of each thread within a warp
int lane_idx)
: smem_pointer_(smem_pointer),
smem_iterator_super_scale_(LayoutSuperScale(IteratorSuperScale::Shape::kColumn),
get_super_scale_smem_ptr(), {1, IteratorSuperScale::Shape::kColumn}, thread_idx),
smem_iterator_local_scale_(LayoutLocalScale(IteratorLocalScale::Shape::kColumn),
get_local_scale_smem_ptr(), {1, IteratorLocalScale::Shape::kColumn}, thread_idx),
smem_iterator_code_scale_(LayoutCodeScaleZp(IteratorCodeScaleZp::Shape::kColumn),
get_code_scale_smem_ptr(), {1, IteratorCodeScaleZp::Shape::kColumn}, thread_idx),
smem_iterator_code_zp_(LayoutCodeScaleZp(IteratorCodeScaleZp::Shape::kColumn),
get_code_zp_smem_ptr(), {1, IteratorCodeScaleZp::Shape::kColumn}, thread_idx),
smem_write_stage_idx_(0),
smem_read_stage_idx_(0) {}
CUTLASS_DEVICE
SuperTensorRef super_scale_ref() {
return {get_super_scale_smem_ptr(), LayoutSuperScale(IteratorSuperScale::Shape::kColumn)};
}
CUTLASS_DEVICE
LocalTensorRef local_scale_ref() {
return {get_local_scale_smem_ptr(), LayoutLocalScale(IteratorLocalScale::Shape::kColumn)};
}
CUTLASS_DEVICE
CodeTensorRef code_scale_ref() {
return {get_code_scale_smem_ptr(), LayoutCodeScaleZp(IteratorCodeScaleZp::Shape::kColumn)};
}
CUTLASS_DEVICE
CodeTensorRef code_zp_ref() {
return {get_code_zp_smem_ptr(), LayoutCodeScaleZp(IteratorCodeScaleZp::Shape::kColumn)};
}
template <bool IsFirstStage>
CUTLASS_DEVICE
void copy_tiles_and_advance_per_stage(Arguments &quant_args, int stage) {
if constexpr (IsFirstStage) {
// Load channel-wise super_scale to shared memory, which only needs to be done once.
typename IteratorSuperScale::Fragment tb_frag_super_scale;
tb_frag_super_scale.clear();
quant_args.iterator_super_scale.load(tb_frag_super_scale);
this->smem_iterator_super_scale_.store(tb_frag_super_scale);
// Load channel-wise code_scale to shared memory, which only needs to be done once.
typename IteratorCodeScaleZp::Fragment tb_frag_code_scale;
tb_frag_code_scale.clear();
quant_args.iterator_code_scale.load(tb_frag_code_scale);
this->smem_iterator_code_scale_.store(tb_frag_code_scale);
// Load channel-wise code_zp to shared memory, which only needs to be done once.
typename IteratorCodeScaleZp::Fragment tb_frag_code_zp;
tb_frag_code_zp.clear();
quant_args.iterator_code_zp.load(tb_frag_code_zp);
this->smem_iterator_code_zp_.store(tb_frag_code_zp);
}
if ((stage % kStagesPerLocalScaleLoad) == 0) {
// Load group-wise local_scale to shared memory.
// Since two uint4b_t local_scale values are packed into a single uint8_t, a new load is only issued once every kStagesPerLocalScaleLoad stages.
using AccessType = typename IteratorLocalScale::AccessType;
cutlass::arch::CacheOperation::Kind const kCacheOp = (sizeof_bits<AccessType>::value == 128)
? cutlass::arch::CacheOperation::Global : cutlass::arch::CacheOperation::Always;
quant_args.iterator_local_scale.set_iteration_index(0);
this->smem_iterator_local_scale_.set_iteration_index(0);
// Async Copy for local_scale
CUTLASS_PRAGMA_UNROLL
for (int j = 0; j < IteratorLocalScale::ThreadMap::Iterations::kCount; ++j) {
AccessType *dst_ptr =
reinterpret_cast<AccessType *>(this->smem_iterator_local_scale_.get());
CUTLASS_PRAGMA_UNROLL
for (int v = 0; v < IteratorLocalScale::kAccessesPerVector; ++v) {
auto gmem_ptr = quant_args.iterator_local_scale.get();
int const kSrcBytes =
sizeof_bits<typename IteratorLocalScale::Element>::value *
IteratorLocalScale::ThreadMap::kElementsPerAccess /
IteratorLocalScale::kAccessesPerVector / 8;
cutlass::arch::cp_async<kSrcBytes, kCacheOp>(
dst_ptr + v, gmem_ptr, quant_args.iterator_local_scale.valid());
}
++quant_args.iterator_local_scale;
}
++this->smem_iterator_local_scale_;
}
}
CUTLASS_DEVICE
void advance_smem_write_stage(Arguments &quant_args) {
if (smem_write_stage_idx_ % kStagesPerLocalScaleLoad == 0) {
// Advance global iterators
quant_args.iterator_local_scale.add_pointer_offset(quant_args.local_scale_pointer_offset);
// Advance shared iterators
int smem_pointer_offset = IteratorLocalScale::Shape::kRow * IteratorLocalScale::Shape::kColumn;
smem_iterator_local_scale_.add_pointer_offset(smem_pointer_offset);
}
// Increment shared memory write stage index
++smem_write_stage_idx_;
if (smem_write_stage_idx_ == kStagesPerLocalScaleLoad * kStages) {
// Wrap back around to the 'start' of the circular buffer in shared memory
int pointer_offset = - kStages * IteratorLocalScale::Shape::kRow * IteratorLocalScale::Shape::kColumn;
smem_iterator_local_scale_.add_pointer_offset(pointer_offset);
smem_write_stage_idx_ = 0;
}
}
CUTLASS_DEVICE
int advance_smem_read_stage() {
int byte_offset = 0;
++smem_read_stage_idx_;
if (smem_read_stage_idx_ % kStagesPerLocalScaleLoad == 0) {
byte_offset = kLocalScaleRows * kSmemColumns;
}
if (smem_read_stage_idx_ == kStagesPerLocalScaleLoad * kStages) {
smem_read_stage_idx_ = 0;
byte_offset = - (kStages - 1) * kLocalScaleRows * kSmemColumns;
}
return byte_offset;
}
CUTLASS_DEVICE
void clear_mask(Arguments &quant_args, bool cond) {
quant_args.iterator_local_scale.clear_mask(cond);
}
};
} // namespace threadblock
} // namespace gemm
} // namespace cutlass

View File

@@ -1,130 +0,0 @@
// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "cutlass/gemm_coord.h"
#include "cutlass/trace.h"
#include "cutlass_extensions/gemm/threadblock/wint2x_unzip.h"
namespace cutlass {
namespace gemm {
namespace threadblock {
template <typename ElementT, typename ScaleElementT, int Rows, int Columns,
int Stages, int NumThreads, WintQuantMethod Method>
struct TileDequanter {
using WeightQuantTraits = WintQuantTraits<ElementT, Method>;
using MmaElementT = typename WeightQuantTraits::MmaWeightType;
using QuantArguments = typename WeightQuantTraits::Arguments;
using UnzipAndDequantFunctor =
UnzipAndDequantFunctor<MmaElementT, Method, Rows, Columns, NumThreads>;
static constexpr bool kUseSharedMemory = true;
static constexpr int kRows = Rows;
static constexpr int kColumns = Columns;
static constexpr int kStages = Stages;
MmaElementT *out_smem_ptr{nullptr};
char *pointer{nullptr};
int64_t ldm{0};
cutlass::MatrixCoord tb_offset;
cutlass::MatrixCoord extent;
ScaleElementT *super_scale_ptr{nullptr};
cutlass::MatrixCoord tb_offset_scale;
QuantArguments quant_args;
int64_t block_start_rows[kStages];
bool need_preload{true};
UnzipAndDequantFunctor unzip_functor;
CUTLASS_DEVICE
TileDequanter(MmaElementT *out_smem_ptr, char *pointer, int64_t ldm,
const cutlass::MatrixCoord &extent,
const cutlass::MatrixCoord &tb_offset,
ScaleElementT *super_scale_ptr,
const cutlass::MatrixCoord &tb_offset_scale,
const QuantArguments &quant_args)
: out_smem_ptr(out_smem_ptr), pointer(pointer), ldm(ldm), extent(extent),
tb_offset(tb_offset), super_scale_ptr(super_scale_ptr),
tb_offset_scale(tb_offset_scale), quant_args(quant_args) {}
CUTLASS_DEVICE
MmaElementT *GetOutPtr() { return out_smem_ptr; }
CUTLASS_DEVICE
void AddTileOffset(const cutlass::MatrixCoord &tile_offset) {
tb_offset.row() += tile_offset.row() * kRows;
tb_offset.column() += tile_offset.column() * kColumns;
tb_offset_scale.column() += tile_offset.column() * kColumns;
}
CUTLASS_DEVICE
void Load(uint8_t *zipped_smem_ptr, uint8_t *column_wise_smem_ptr, int stage) {
int zipped_row = WeightQuantTraits::CaclPackedDim(tb_offset.row());
if (tb_offset.row() >= extent.row() ||
tb_offset.column() >= extent.column()) {
return;
}
block_start_rows[stage % kStages] = tb_offset.row();
using ZippedT = typename WeightQuantTraits::WeightType;
ZippedT *in_ptr = reinterpret_cast<ZippedT *>(pointer) + zipped_row * ldm +
tb_offset.column();
ScaleElementT *scale_ptr = super_scale_ptr + tb_offset_scale.column();
if constexpr (Method == WintQuantMethod::kWeightOnlyInt2) {
const uint8_t *local_scale_ptr = quant_args.local_scale_ptr +
(tb_offset.row() / 128) * ldm +
tb_offset_scale.column();
const float *code_scale_ptr =
quant_args.code_scale_ptr + tb_offset_scale.column();
const float *code_zp_ptr =
quant_args.code_zp_ptr + tb_offset_scale.column();
typename UnzipAndDequantFunctor::Arguments args(zipped_smem_ptr, column_wise_smem_ptr);
unzip_functor.LoadAsync(in_ptr, local_scale_ptr, code_scale_ptr, code_zp_ptr,
scale_ptr, &args, ldm, need_preload);
need_preload = false;
} else {
// CUTLASS_TRACE_DEVICE("Not Supported!");
}
}
CUTLASS_DEVICE
void UnpackAndDequant(uint8_t *zipped_smem_ptr, uint8_t *column_wise_smem_ptr, int stage) {
int64_t block_start_row = block_start_rows[stage % kStages];
if (block_start_row >= extent.row()) {
return;
}
if constexpr (Method == WintQuantMethod::kWeightOnlyInt2) {
typename UnzipAndDequantFunctor::Arguments args(zipped_smem_ptr, column_wise_smem_ptr);
unzip_functor.ComputeVectorized(args, out_smem_ptr, block_start_row);
} else {
// CUTLASS_TRACE_DEVICE("Not Supported!");
}
}
};
} // namespace threadblock
} // namespace gemm
} // namespace cutlass

View File

@@ -41,12 +41,9 @@
#include "cutlass_extensions/arch/mma.h"
#include "cutlass_extensions/gemm/warp/mma_tensorop_compute_B_with_f16.h"
namespace cutlass
{
namespace gemm
{
namespace warp
{
namespace cutlass {
namespace gemm {
namespace warp {
/////////////////////////////////////////////////////////////////////////////////////////////////
@@ -81,7 +78,7 @@ private:
// Shape for computing the FP16s
using ComputeInstructionShape = InstructionShape_;
// Chosen so we get K=16 for int8 and K=32 for int4.
// Chosen so we get K=16 for int8, K=32 for int4, K=64 for int2.
static constexpr int LoadInstructionK = 128 / sizeof_bits<ElementB>::value;
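// e.g. sizeof_bits<uint2b_t>::value == 2, so LoadInstructionK = 128 / 2 = 64 for int2; with a K=16
// compute instruction this corresponds to an expansion factor of 64 / 16 = 4.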
// Shape for loading the narrow data type from shared memory

View File

@@ -58,15 +58,12 @@
/////////////////////////////////////////////////////////////////////////////////////////////////
namespace cutlass
{
namespace gemm
{
namespace warp
{
namespace cutlass {
namespace gemm {
namespace warp {
/////////////////////////////////////////////////////////////////////////////////////////////////
/// Structure to compute the matrix product targeting CUDA cores and SIMT math instructions.
/// Structure to compute the matrix product targeting Tensor Cores, for the case when A is floating point and B is quantized integer.
template <
/// Size of the Gemm problem - concept: gemm::GemmShape<>
typename Shape_,
@@ -297,6 +294,235 @@ public:
}
};
/////////////////////////////////////////////////////////////////////////////////////////////////
/// Structure to compute the matrix product targeting Tensor Cores, for the case when A is floating point and B is quantized integer.
/// Specialization for B of uint2b_t.
template <
/// Size of the Gemm problem - concept: gemm::GemmShape<>
typename Shape_,
/// Data type of A elements
typename ElementA_,
/// Layout of A matrix (concept: MatrixLayout)
typename LayoutA_,
/// Layout of B matrix (concept: MatrixLayout)
typename LayoutB_,
/// Element type of C matrix
typename ElementC_,
/// Layout of C matrix (concept: MatrixLayout)
typename LayoutC_,
/// Policy describing warp-level MmaTensorOp (concept: MmaTensorOp policy)
typename Policy_,
/// Instruction shape to override shared memory iterators with
typename SharedMemoryInstructionShape_,
/// Number of partitions along K dimension
int PartitionsK_,
/// Store the accumulators in row major or column major. Row major is used
/// when output layout is interleaved.
bool AccumulatorsInRowMajor>
class MmaTensorOpComputeBWithF16<
Shape_,
ElementA_,
LayoutA_,
uint2b_t,
LayoutB_,
ElementC_,
LayoutC_,
Policy_,
SharedMemoryInstructionShape_,
PartitionsK_,
AccumulatorsInRowMajor>
{
public:
/// Shape of warp-level matrix operation (concept: GemmShape)
using Shape = Shape_;
/// Data type of multiplicand A
using ElementA = ElementA_;
/// Layout of multiplicand A
using LayoutA = LayoutA_;
/// Data type of multiplicand B
using ElementB = uint2b_t;
/// Layout of multiplicand B
using LayoutB = LayoutB_;
/// Data type of accumulator matrix C
using ElementC = ElementC_;
/// Layout of accumulator matrix C
using LayoutC = LayoutC_;
/// Shape of the warp in units of thread (concept: MmaLanePolicySimt)
using Policy = Policy_;
/// Underlying matrix multiply operator (concept: arch::Mma)
using ArchMmaOperator = typename Policy::Operator;
/// Indicates math operator
using MathOperator = typename ArchMmaOperator::Operator;
/// Architecture tag from underlying instruction
using ArchTag = typename ArchMmaOperator::ArchTag;
static_assert((platform::is_same<typename ArchMmaOperator::ElementA, half_t>::value
&& platform::is_same<typename ArchMmaOperator::ElementB, half_t>::value)
|| (platform::is_same<typename ArchMmaOperator::ElementA, bfloat16_t>::value
&& platform::is_same<typename ArchMmaOperator::ElementB, bfloat16_t>::value
&& ArchTag::kMinComputeCapability >= 80),
"MmaTensorOpCvtBToA only supports underlying HMMA/QMMA");
static_assert(platform::is_same<ElementA, half_t>::value
|| (platform::is_same<ElementA, bfloat16_t>::value && ArchTag::kMinComputeCapability >= 80),
"MmaTensorOpCvtBToA only supports Fp16 A or Bf16 A on Ampere+");
/// Indicates class of matrix operator
using OperatorClass = arch::OpClassTensorOp;
/// Shape of underlying instruction
using InstructionShape = typename ArchMmaOperator::Shape;
/// Instruction shape to override shared memory iterators with
using SharedMemoryInstructionShape = SharedMemoryInstructionShape_;
static_assert(
SharedMemoryInstructionShape::kM == InstructionShape::kM, "M dimension of compute instruction must match load");
static_assert(
SharedMemoryInstructionShape::kN == InstructionShape::kN, "N dimension of compute instruction must match load");
static constexpr int kExpansionFactor = SharedMemoryInstructionShape::kK / InstructionShape::kK;
static_assert(!(Shape::kK % SharedMemoryInstructionShape::kK), "");
/// Complex transform on A operand
static ComplexTransform const kTransformA = ComplexTransform::kNone;
/// Complex transform on B operand
static ComplexTransform const kTransformB = ComplexTransform::kNone;
/// Number of threads participating in warp-level matrix product
static int const kThreadCount = 32;
/// Number of partitions along K dimension
static int const kPartitionsK = PartitionsK_;
public:
/// Iterates over the A operand in memory
using IteratorA
= MmaTensorOpMultiplicandTileIterator<MatrixShape<Shape::kM, Shape::kK>, Operand::kA, ElementA, LayoutA,
MatrixShape<InstructionShape::kM, InstructionShape::kK>, Policy::OpDelta::kRow, kThreadCount, kPartitionsK>;
/// Storage for A tile
using FragmentA = typename IteratorA::Fragment;
/// Storage for transformed A tile
using TransformedFragmentA = Array<typename ArchMmaOperator::ElementA, FragmentA::kElements>;
/// Iterates over the B operand in memory
using IteratorB = MmaTensorOpMultiplicandTileIterator<MatrixShape<Shape::kK, Shape::kN>, Operand::kB, ElementB,
LayoutB, MatrixShape<SharedMemoryInstructionShape::kK, InstructionShape::kN>, Policy::OpDelta::kRow,
kThreadCount, kPartitionsK>;
/// Storage for B tile
using FragmentB = typename IteratorB::Fragment;
/// Storage for transformed B tile
using TransformedFragmentB =
Array<typename ArchMmaOperator::ElementB, FragmentB::kElements / kExpansionFactor>;
/// Iterates over the C operand in memory
using IteratorC = MmaTensorOpAccumulatorTileIterator<MatrixShape<Shape::kM, Shape::kN>, ElementC, LayoutC,
typename ArchMmaOperator::Shape, typename Policy::OpDelta>;
/// Storage for C tile
using FragmentC = typename IteratorC::Fragment;
/// Number of mma operations performed
using MmaIterations = MatrixShape<(Shape::kM + ArchMmaOperator::Shape::kM - 1) / ArchMmaOperator::Shape::kM,
(Shape::kN + ArchMmaOperator::Shape::kN - 1) / ArchMmaOperator::Shape::kN>;
public:
/// Underlying matrix multiply operator (concept: arch::Mma)
ArchMmaOperator mma;
public:
//
// Methods
//
/// Ctor
CUTLASS_DEVICE
MmaTensorOpComputeBWithF16() {}
/// Performs a warp-level matrix multiply-accumulate operation
CUTLASS_DEVICE
void operator()(FragmentC& D, TransformedFragmentA const& A, TransformedFragmentB const& B, FragmentC const& C) const
{
using MmaOperandA = typename ArchMmaOperator::FragmentA;
using MmaOperandB = typename ArchMmaOperator::FragmentB;
using MmaOperandC = typename ArchMmaOperator::FragmentC;
D = C;
MmaOperandA const* ptr_A = reinterpret_cast<MmaOperandA const*>(&A);
MmaOperandB const* ptr_B = reinterpret_cast<MmaOperandB const*>(&B);
MmaOperandC* ptr_D = reinterpret_cast<MmaOperandC*>(&D);
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 800)
// Serpentine visitation order maximizing reuse of Rb
CUTLASS_PRAGMA_UNROLL
for (int n = 0; n < MmaIterations::kColumn; ++n)
{
CUTLASS_PRAGMA_UNROLL
for (int m = 0; m < MmaIterations::kRow; ++m)
{
int m_serpentine = ((n % 2) ? (MmaIterations::kRow - 1 - m) : m);
if (AccumulatorsInRowMajor)
{ // matrix B is reordered
mma(ptr_D[n + m_serpentine * MmaIterations::kColumn], ptr_A[m_serpentine], ptr_B[n],
ptr_D[n + m_serpentine * MmaIterations::kColumn]);
}
else
{
mma(ptr_D[m_serpentine + n * MmaIterations::kRow], ptr_A[m_serpentine], ptr_B[n],
ptr_D[m_serpentine + n * MmaIterations::kRow]);
}
}
}
#elif defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
// Serpentine visitation order maximizing reuse of Ra
CUTLASS_PRAGMA_UNROLL
for (int m = 0; m < MmaIterations::kRow; ++m)
{
CUTLASS_PRAGMA_UNROLL
for (int n = 0; n < MmaIterations::kColumn; ++n)
{
int n_serpentine = ((m % 2) ? (MmaIterations::kColumn - 1 - n) : n);
if (AccumulatorsInRowMajor)
{ // matrix B is reordered
mma(ptr_D[n_serpentine + m * MmaIterations::kColumn], ptr_A[m], ptr_B[n_serpentine],
ptr_D[n_serpentine + m * MmaIterations::kColumn]);
}
else
{
mma(ptr_D[m + n_serpentine * MmaIterations::kRow], ptr_A[m], ptr_B[n_serpentine],
ptr_D[m + n_serpentine * MmaIterations::kRow]);
}
}
}
#else
assert(0);
#endif
}
};
/////////////////////////////////////////////////////////////////////////////////////////////////
} // namespace warp

View File

@@ -0,0 +1,442 @@
/***************************************************************************************************
* Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights
*reserved. SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
*this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
*ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
*LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
*CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
*SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
*INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
*CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
*ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
*POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief Defines iterators used by warp-level matrix multiply operations
targeting Tensor Cores.
*/
#pragma once
#include "cutlass/cutlass.h"
#include "cutlass/array.h"
#include "cutlass/matrix_shape.h"
#include "cutlass/numeric_types.h"
#include "cutlass/tensor_ref.h"
#include "cutlass/arch/arch.h"
#include "cutlass/arch/memory_sm75.h"
#include "cutlass/gemm/gemm.h"
#include "cutlass/layout/matrix.h"
#include "cutlass/layout/pitch_linear.h"
#include "cutlass/layout/tensor.h"
#include "cutlass/functional.h"
#include "cutlass/platform/platform.h"
#include "cutlass_extensions/interleaved_numeric_conversion.h"
namespace cutlass {
namespace gemm {
namespace warp {
namespace detail {
template <typename T>
struct DataTypeTraits;
template <>
struct DataTypeTraits<bfloat16_t> {
using Type = __nv_bfloat16;
using DualType = __nv_bfloat162;
};
template <>
struct DataTypeTraits<half_t> {
using Type = __half;
using DualType = __half2;
};
template <typename T, int N, typename Enable = void>
struct LocalScaleConverter {
using FragmentSource = Array<uint8_t, N>;
using FragmentResult = Array<T, N>;
CUTLASS_DEVICE
static void Apply(FragmentSource const& local_scale_frag,
FragmentResult const& super_scale_frag,
FragmentResult& scale_frag,
int shift_bit) {
constexpr uint32_t kLocalScaleMask = 0xf;
CUTLASS_PRAGMA_UNROLL
for (int i = 0; i < N; ++i) {
int32_t shifted_value = (static_cast<int32_t>(local_scale_frag[i]) >> shift_bit) & kLocalScaleMask;
scale_frag[i] = static_cast<T>(shifted_value) * super_scale_frag[i];
}
}
};
template <int N>
struct LocalScaleConverter<half_t, N, typename platform::enable_if<N % 4 == 0>::type> {
using FragmentSource = Array<uint8_t, N>;
using FragmentResult = Array<half_t, N>;
CUTLASS_DEVICE
static void Apply(FragmentSource const& local_scale_frag,
FragmentResult const& super_scale_frag,
FragmentResult& scale_frag,
int shift_bit) {
constexpr uint32_t immLut = (0xf0 & 0xcc) | 0xaa;
constexpr uint32_t MASK = 0x000f000f;
// 2^10 = 1024
constexpr uint32_t I4s_TO_FP16s_MAGIC_NUM = 0x64006400;
// -2^10 = -1024
constexpr uint32_t FP16_BIAS = 0xE400E400;
// 1.0
constexpr uint32_t FP16_ONE = 0x3C003C00;
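// Dequant trick: lop3 ORs each 4-bit nibble into the mantissa of 1024.0 (0x6400), producing fp16
// values equal to 1024 + v; the __hfma2 with 1.0 (FP16_ONE) and -1024.0 (FP16_BIAS) then recovers v,
// which is finally multiplied by the channel-wise super_scale.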
__half2* scale_ptr = reinterpret_cast<__half2 *>(&scale_frag);
__half2 const* super_scale_ptr = reinterpret_cast<__half2 const*>(&super_scale_frag);
uint32_t const* local_scale_ptr = reinterpret_cast<uint32_t const*>(&local_scale_frag);
CUTLASS_PRAGMA_UNROLL
for (int i = 0; i < N / 4; ++i) {
int i4s = local_scale_ptr[i] >> shift_bit;
// unpack: 0, 1
int32_t low = __byte_perm(i4s, i4s, 0xF1F0);
int32_t unpack0 = lop3<immLut>(low, MASK, I4s_TO_FP16s_MAGIC_NUM);
// unpack: 2, 3
int32_t high = __byte_perm(i4s, i4s, 0xF3F2);
int32_t unpack1 = lop3<immLut>(high, MASK, I4s_TO_FP16s_MAGIC_NUM);
__half2 scale0 = __hfma2(*reinterpret_cast<__half2*>(&unpack0),
*reinterpret_cast<const __half2*>(&FP16_ONE),
*reinterpret_cast<const __half2*>(&FP16_BIAS));
__half2 scale1 = __hfma2(*reinterpret_cast<__half2*>(&unpack1),
*reinterpret_cast<const __half2*>(&FP16_ONE),
*reinterpret_cast<const __half2*>(&FP16_BIAS));
scale_ptr[2 * i] = __hmul2(scale0, super_scale_ptr[2 * i]);
scale_ptr[2 * i + 1] = __hmul2(scale1, super_scale_ptr[2 * i + 1]);
}
}
};
template <int N>
struct LocalScaleConverter<bfloat16_t, N, typename platform::enable_if<N % 4 == 0>::type> {
using FragmentSource = Array<uint8_t, N>;
using FragmentResult = Array<bfloat16_t, N>;
CUTLASS_DEVICE
static void Apply(FragmentSource const& local_scale_frag,
FragmentResult const& super_scale_frag,
FragmentResult& scale_frag,
int shift_bit) {
#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800) && defined(ENABLE_BF16))
constexpr uint32_t immLut = (0xF0 & 0xCC) | 0xAA;
constexpr uint32_t MASK = 0x000F000F;
constexpr uint32_t I4s_TO_BF16s_MAGIC_NUM = 0x43004300;
constexpr uint32_t BF16_BIAS = 0xC300C300;
constexpr uint32_t BF16_ONE = 0x3F803F80;
__nv_bfloat162* scale_ptr = reinterpret_cast<__nv_bfloat162 *>(&scale_frag);
__nv_bfloat162 const* super_scale_ptr = reinterpret_cast<__nv_bfloat162 const*>(&super_scale_frag);
uint32_t const* local_scale_ptr = reinterpret_cast<uint32_t const*>(&local_scale_frag);
CUTLASS_PRAGMA_UNROLL
for (int i = 0; i < N / 4; ++i) {
int i4s = local_scale_ptr[i] >> shift_bit;
// unpack: 0, 1
int32_t low = __byte_perm(i4s, i4s, 0xF1F0);
int32_t unpack0 = lop3<immLut>(low, MASK, I4s_TO_BF16s_MAGIC_NUM);
// unpack: 2, 3
int32_t high = __byte_perm(i4s, i4s, 0xF3F2);
int32_t unpack1 = lop3<immLut>(high, MASK, I4s_TO_BF16s_MAGIC_NUM);
nv_bfloat162 scale0 = __hfma2(*reinterpret_cast<nv_bfloat162*>(&unpack0),
*reinterpret_cast<const nv_bfloat162*>(&BF16_ONE),
*reinterpret_cast<const nv_bfloat162*>(&BF16_BIAS));
nv_bfloat162 scale1 = __hfma2(*reinterpret_cast<nv_bfloat162*>(&unpack1),
*reinterpret_cast<const nv_bfloat162*>(&BF16_ONE),
*reinterpret_cast<const nv_bfloat162*>(&BF16_BIAS));
scale_ptr[2 * i] = __hmul2(scale0, super_scale_ptr[2 * i]);
scale_ptr[2 * i + 1] = __hmul2(scale1, super_scale_ptr[2 * i + 1]);
}
#else
// Slow path not implemented here on purpose. If we need to do HMMA on older arch, scale conversion should
// happen before scales are stored to shared memory and we should use the fp16 dequantizer. This will avoid
// numerous conversion instructions in GEMM main loop.
arch::device_breakpoint();
#endif
}
};
} // namespace detail
////////////////////////////////////////////////////////////////////////////////
template <
/// Matrix multiply operator
typename MmaOperator_,
/// Size of the matrix to load (concept: MatrixShape)
typename Shape_,
/// Operand identity
Operand Operand,
/// Data type of Scale elements
typename ElementOperand_,
/// Layout of operand
typename Layout_,
/// Group size for quantization
int GroupSize_,
///
typename Enable = void>
class MmaTensorOpWin2xDequantizer {
//static_assert(false, "Not Supported!");
};
////////////////////////////////////////////////////////////////////////////////
// Bfloat specialization for Ampere
template <
/// Underlying matrix multiply operator (concept: MmaTensorOp)
typename MmaOperator_,
/// Shape of the warp level matrix multiply (concept: GemmShape)
typename Shape_,
/// Data type of Scale elements
typename ElementOperand_,
/// Group size for quantization
int GroupSize_>
class MmaTensorOpWin2xDequantizer<
MmaOperator_,
Shape_,
Operand::kB,
ElementOperand_,
layout::RowMajor,
GroupSize_>
//typename platform::enable_if<MmaOperator_::ArchTag::kMinComputeCapability >= 80
// && platform::is_same<typename MmaOperator_::ArchMmaOperator::LayoutB, layout::ColumnMajor>::value>::type>
{
public:
static_assert(platform::is_same<ElementOperand_, half_t>::value || platform::is_same<ElementOperand_, bfloat16_t>::value,
"T must be fp16 or bf16");
/// Mma Operator
using MmaOperator = MmaOperator_;
// The architecture-specific mma operator being used
using ArchMmaOperator = typename MmaOperator::ArchMmaOperator;
// Mma Instruction Shape
using InstructionShape = typename ArchMmaOperator::Shape;
/// Warp mma shape
using Shape = Shape_;
/// Type of mma operand
using ElementOperand = ElementOperand_;
/// Layout of the scales in shared memory
using Layout = layout::RowMajor;
/// Group size for quantization
static constexpr int kGroupSize = GroupSize_;
/// Type of input
using ElementB = typename MmaOperator::FragmentB::Element;
static_assert(platform::is_same<ElementB, uint2b_t>::value, "ElementB must be uint2b_t");
/// Type of the scales
using ElementLocalScale = uint4b_t;
using ElementSuperScale = ElementOperand;
using ElementCodeScaleZp = float;
// Fragment to hold scale data to apply to B before mma
// We need 1 fp16 per matrix iteration in the N dimension
static constexpr int kWarpIterationsAlongN = MmaOperator::MmaIterations::kColumn;
// use uint8_t to store two 4-bit local scales
using FragmentLocalScale = Array<uint8_t, kWarpIterationsAlongN>;
using FragmentSuperScale = Array<ElementSuperScale, kWarpIterationsAlongN>;
using FragmentCodeScaleZp = Array<ElementCodeScaleZp, kWarpIterationsAlongN>;
/// Fragment to hold B data before Mma
using FragmentInput = Array<ElementB, MmaOperator::FragmentB::kElements>;
// This is the ratio of the K extent of the load instruction to that of the compute instruction.
static constexpr int kExpansionFactor = MmaOperator::IteratorB::InstructionShape::kRow / InstructionShape::kK;
static constexpr int kNumPacks = sizeof_bits<uint8_t>::value / sizeof_bits<ElementB>::value;
static constexpr int kUnpackFactor = MmaOperator::FragmentB::kElements / (kWarpIterationsAlongN * kNumPacks);
static constexpr int kUnpackInterval = kExpansionFactor / kUnpackFactor;
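// Rough example (assuming a K=64 B load shape feeding K=16 compute MMAs): kNumPacks = 8 / 2 = 4
// uint2b_t values per packed byte and kExpansionFactor = 64 / 16 = 4; kUnpackInterval =
// kExpansionFactor / kUnpackFactor, and dequantize() below only supports kUnpackInterval == 1.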
/// Unpack 4 uint2b_t values compressed in a uint8_t to floating point.
using Uint2Converter = FastInterleavedAndBiasedNumericArrayConverter<
ElementOperand, ElementB, MmaOperator::FragmentB::kElements / kUnpackFactor>;
using FragmentInputUnpack = typename Uint2Converter::result_type;
/// Fragment to hold internal scales before Mma
using FragmentScale = Array<ElementOperand, FragmentLocalScale::kElements>;
/// Fragment of dequantized B
using FragmentOutput = Array<ElementOperand, MmaOperator::FragmentB::kElements / kExpansionFactor>;
/// TensorRef type for loading element from a tensor
using SuperTensorRef = cutlass::TensorRef<ElementSuperScale, Layout>;
using LocalTensorRef = cutlass::TensorRef<ElementLocalScale, Layout>;
using CodeTensorRef = cutlass::TensorRef<ElementCodeScaleZp, Layout>;
private:
//
// Data members
//
uint8_t* pointer_local_scale_;
ElementCodeScaleZp* pointer_code_scale_;
ElementCodeScaleZp* pointer_code_zp_;
ElementSuperScale* pointer_super_scale_;
//FragmentInputUnpack unpacked_frag_;
FragmentScale scale_frag_;
public:
CUTLASS_DEVICE
MmaTensorOpWin2xDequantizer(SuperTensorRef smem_super_scale,
LocalTensorRef smem_local_scale,
CodeTensorRef smem_code_scale,
CodeTensorRef smem_code_zp,
int warp_idx_n,
int lane_idx) {
int warp_offset = warp_idx_n * Shape::kN;
int quad = lane_idx / 4;
int thread_offset = warp_offset + quad;
pointer_super_scale_ = smem_super_scale.data() + thread_offset;
pointer_code_scale_ = smem_code_scale.data() + thread_offset;
pointer_code_zp_ = smem_code_zp.data() + thread_offset;
pointer_local_scale_ = reinterpret_cast<uint8_t *>(smem_local_scale.data()) + thread_offset;
}
/// Channel-wise params, which only need to be loaded once
CUTLASS_DEVICE
void load(FragmentCodeScaleZp& code_scale_frag,
FragmentCodeScaleZp& code_zp_frag,
FragmentSuperScale& super_scale_frag) {
CUTLASS_PRAGMA_UNROLL
for (int mma_n_iter = 0; mma_n_iter < kWarpIterationsAlongN; ++mma_n_iter) {
super_scale_frag[mma_n_iter] = pointer_super_scale_[mma_n_iter * InstructionShape::kN]; // bank conflict
code_scale_frag[mma_n_iter] = pointer_code_scale_[mma_n_iter * InstructionShape::kN];
code_zp_frag[mma_n_iter] = pointer_code_zp_[mma_n_iter * InstructionShape::kN];
}
}
/// Group-wise params, which need to be loaded multiple times
CUTLASS_DEVICE
void load(FragmentLocalScale& local_scale_frag) {
CUTLASS_PRAGMA_UNROLL
for (int mma_n_iter = 0; mma_n_iter < kWarpIterationsAlongN; ++mma_n_iter) {
local_scale_frag[mma_n_iter] = pointer_local_scale_[mma_n_iter * InstructionShape::kN]; // bank conflict
}
}
CUTLASS_DEVICE
void dequantize(const FragmentLocalScale& local_scale_frag,
const FragmentCodeScaleZp& code_scale_frag,
const FragmentCodeScaleZp& code_zp_frag,
const FragmentSuperScale& super_scale_frag,
const FragmentInput& input_frag,
FragmentOutput& output_frag,
int tb_offset_k,
int warp_k_compute_offset) {
if constexpr (kUnpackInterval != 1) {
// unsupported for now
arch::device_breakpoint();
}
typename Uint2Converter::source_type source_frag;
int in_offset = warp_k_compute_offset * kUnpackInterval;
uint8_t const* ptr_input = reinterpret_cast<uint8_t const*>(&input_frag);
uint8_t* ptr_source = reinterpret_cast<uint8_t *>(&source_frag);
CUTLASS_PRAGMA_UNROLL
for (int mma_n_iter = 0; mma_n_iter < kWarpIterationsAlongN; ++mma_n_iter) {
ptr_source[mma_n_iter] = ptr_input[mma_n_iter * kUnpackFactor + in_offset];
}
FragmentInputUnpack unpacked_frag = Uint2Converter::convert(source_frag, code_scale_frag, code_zp_frag);
// dequantize local_scale
if (warp_k_compute_offset == 0) {
using LocalScaleConverter = detail::LocalScaleConverter<ElementOperand, FragmentLocalScale::kElements>;
// special for TileRows = 64
int local_scale_shift = (((tb_offset_k / kGroupSize) + 1) & 1) * 4;
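// With group index g = tb_offset_k / kGroupSize: even g selects the high nibble (shift 4),
// odd g selects the low nibble (shift 0) of the packed uint8_t local_scale.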
LocalScaleConverter::Apply(local_scale_frag, super_scale_frag, scale_frag_, local_scale_shift);
}
// unscale
// After applying LOP3 optimizations for performance, the B operand requires data rearrangement.
// reorder: [0, 4, 1, 5, 2, 6, 3, 7, 8, 12, 9, 13, 10, 14, 11, 15]
const int kWarpIterationsAlongK = FragmentOutput::kElements / kWarpIterationsAlongN;
using Type = typename detail::DataTypeTraits<ElementOperand>::Type;
using DualType = typename detail::DataTypeTraits<ElementOperand>::DualType;
Type* output_ptr = reinterpret_cast<Type *>(&output_frag);
DualType const* unpacked_ptr = reinterpret_cast<DualType const*>(&unpacked_frag);
DualType const* scale_ptr = reinterpret_cast<DualType const*>(&scale_frag_);
CUTLASS_PRAGMA_UNROLL
for (int mma_n_iter = 0; mma_n_iter < kWarpIterationsAlongN; mma_n_iter += 2) {
int mapped_idx_base = (mma_n_iter / 2) * kWarpIterationsAlongK;
DualType scalex2 = scale_ptr[mma_n_iter / 2];
CUTLASS_PRAGMA_UNROLL
for (int mma_k_iter = 0; mma_k_iter < kWarpIterationsAlongK; ++mma_k_iter) {
DualType unpacked_valuex2 = unpacked_ptr[mapped_idx_base + mma_k_iter];
DualType scaled_value = __hmul2(unpacked_valuex2, scalex2);
output_ptr[mma_n_iter * kWarpIterationsAlongK + mma_k_iter] = scaled_value.x;
output_ptr[(mma_n_iter + 1) * kWarpIterationsAlongK + mma_k_iter] = scaled_value.y;
}
}
}
/// Add an offset to pointer in units of elements.
/// Only the group-wise params need this.
CUTLASS_DEVICE
void add_pointer_offset(int64_t const& offset) {
pointer_local_scale_ += offset;
}
};
////////////////////////////////////////////////////////////////////////////////
} // namespace warp
} // namespace gemm
} // namespace cutlass
////////////////////////////////////////////////////////////////////////////////

View File

@@ -39,18 +39,25 @@
#include "cutlass/array.h"
#include "cutlass/half.h"
#include "cutlass/numeric_types.h"
#include "cutlass/trace.h"
namespace cutlass
{
namespace cutlass {
template <int lut>
__device__ inline int lop3(int a, int b, int c) {
int res;
asm volatile("lop3.b32 %0, %1, %2, %3, %4;\n"
: "=r"(res)
: "r"(a), "r"(b), "r"(c), "n"(lut));
return res;
}
// This converter is meant to be used with data interleaved in a 32-bit register where the even elements are in the low
// bits and the odd elements are in the high bits of the register. In addition, it assumes elements were originally
// signed and had a bias of 2**(b-1) added (where b is the number of bits in the type) to make all numbers unsigned.
// This converter will uninterleave the data and subtract the bias while converting to the result type.
template <typename T, typename S, int N>
struct FastInterleavedAndBiasedNumericArrayConverter
{
};
struct FastInterleavedAndBiasedNumericArrayConverter;
template <>
struct FastInterleavedAndBiasedNumericArrayConverter<half_t, uint8_t, 4>
@@ -440,6 +447,329 @@ struct FastInterleavedAndBiasedNumericArrayConverter<bfloat16_t, uint4b_t, N>
}
};
template <>
struct FastInterleavedAndBiasedNumericArrayConverter<half_t, uint2b_t, 16>
{
using result_type = Array<half_t, 16>;
using source_type = Array<uint2b_t, 16>;
using ScaleComputeT = float;
using code_type = Array<ScaleComputeT, 4>;
CUTLASS_DEVICE
static result_type convert(source_type const& source, ScaleComputeT code_scale, ScaleComputeT code_zp)
{
uint32_t const i8s = reinterpret_cast<uint32_t const&>(source);
// 2^23 = 8388608
static constexpr uint32_t FP32_BASE = 0x4B000000;
float fp32_intermediates[4];
uint32_t* fp32_intermediates_casted = reinterpret_cast<uint32_t*>(fp32_intermediates);
fp32_intermediates_casted[0] = __byte_perm(i8s, FP32_BASE, 0x7650);
fp32_intermediates_casted[1] = __byte_perm(i8s, FP32_BASE, 0x7651);
fp32_intermediates_casted[2] = __byte_perm(i8s, FP32_BASE, 0x7652);
fp32_intermediates_casted[3] = __byte_perm(i8s, FP32_BASE, 0x7653);
asm volatile("sub.f32 %0, %1, %2;\n" : "=r"(fp32_intermediates_casted[0]) : "r"(fp32_intermediates_casted[0]), "r"(FP32_BASE));
asm volatile("sub.f32 %0, %1, %2;\n" : "=r"(fp32_intermediates_casted[1]) : "r"(fp32_intermediates_casted[1]), "r"(FP32_BASE));
asm volatile("sub.f32 %0, %1, %2;\n" : "=r"(fp32_intermediates_casted[2]) : "r"(fp32_intermediates_casted[2]), "r"(FP32_BASE));
asm volatile("sub.f32 %0, %1, %2;\n" : "=r"(fp32_intermediates_casted[3]) : "r"(fp32_intermediates_casted[3]), "r"(FP32_BASE));
int32_t decode_value[4];
ScaleComputeT new_code_zp = code_zp + 0.5f;
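// Adding 0.5f before __float2int_rd turns the floor into round-half-up of
// fp32_intermediates[i] * code_scale + code_zp.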
decode_value[0] = __float2int_rd(fmaf(fp32_intermediates[0], code_scale, new_code_zp));
decode_value[1] = __float2int_rd(fmaf(fp32_intermediates[1], code_scale, new_code_zp));
decode_value[2] = __float2int_rd(fmaf(fp32_intermediates[2], code_scale, new_code_zp));
decode_value[3] = __float2int_rd(fmaf(fp32_intermediates[3], code_scale, new_code_zp));
return convert_impl(decode_value);
}
CUTLASS_DEVICE
static result_type convert(source_type const& source, code_type const& code_scale, code_type const& code_zp)
{
uint32_t const i8s = reinterpret_cast<uint32_t const&>(source);
// 2^23 = 8388608
static constexpr uint32_t FP32_BASE = 0x4B000000;
float fp32_intermediates[4];
uint32_t* fp32_intermediates_casted = reinterpret_cast<uint32_t*>(fp32_intermediates);
fp32_intermediates_casted[0] = __byte_perm(i8s, FP32_BASE, 0x7650);
fp32_intermediates_casted[1] = __byte_perm(i8s, FP32_BASE, 0x7651);
fp32_intermediates_casted[2] = __byte_perm(i8s, FP32_BASE, 0x7652);
fp32_intermediates_casted[3] = __byte_perm(i8s, FP32_BASE, 0x7653);
asm volatile("sub.f32 %0, %1, %2;\n" : "=r"(fp32_intermediates_casted[0]) : "r"(fp32_intermediates_casted[0]), "r"(FP32_BASE));
asm volatile("sub.f32 %0, %1, %2;\n" : "=r"(fp32_intermediates_casted[1]) : "r"(fp32_intermediates_casted[1]), "r"(FP32_BASE));
asm volatile("sub.f32 %0, %1, %2;\n" : "=r"(fp32_intermediates_casted[2]) : "r"(fp32_intermediates_casted[2]), "r"(FP32_BASE));
asm volatile("sub.f32 %0, %1, %2;\n" : "=r"(fp32_intermediates_casted[3]) : "r"(fp32_intermediates_casted[3]), "r"(FP32_BASE));
int32_t decode_value[4];
decode_value[0] = __float2int_rd(fmaf(fp32_intermediates[0], code_scale[0], code_zp[0] + 0.5f));
decode_value[1] = __float2int_rd(fmaf(fp32_intermediates[1], code_scale[1], code_zp[1] + 0.5f));
decode_value[2] = __float2int_rd(fmaf(fp32_intermediates[2], code_scale[2], code_zp[2] + 0.5f));
decode_value[3] = __float2int_rd(fmaf(fp32_intermediates[3], code_scale[3], code_zp[3] + 0.5f));
return convert_impl(decode_value);
}
CUTLASS_DEVICE
static result_type convert_impl(int32_t* decode_value)
{
result_type result;
static constexpr uint32_t immLut = (0xF0 & 0xCC) | 0xAA;
static constexpr uint32_t MASK = 0x003F003F;
// 2^10 = 1024
static constexpr uint32_t EX = 0x64006400;
uint32_t* h = reinterpret_cast<uint32_t*>(&result);
int32_t q0 = __byte_perm(decode_value[0], decode_value[1], 0x5410);
int32_t q1 = __byte_perm(decode_value[2], decode_value[3], 0x5410);
h[0] = lop3<immLut>(q0 >> 9, MASK, EX);
h[1] = lop3<immLut>(q0 >> 6, MASK, EX);
h[2] = lop3<immLut>(q0 >> 3, MASK, EX);
h[3] = lop3<immLut>(q0, MASK, EX);
h[4] = lop3<immLut>(q1 >> 9, MASK, EX);
h[5] = lop3<immLut>(q1 >> 6, MASK, EX);
h[6] = lop3<immLut>(q1 >> 3, MASK, EX);
h[7] = lop3<immLut>(q1, MASK, EX);
// 1024 + 32 = 1056
static constexpr uint32_t SUB = 0x64206420;
asm volatile("sub.f16x2 %0, %1, %2;\n" : "=r"(h[0]) : "r"(h[0]), "r"(SUB));
asm volatile("sub.f16x2 %0, %1, %2;\n" : "=r"(h[1]) : "r"(h[1]), "r"(SUB));
asm volatile("sub.f16x2 %0, %1, %2;\n" : "=r"(h[2]) : "r"(h[2]), "r"(SUB));
asm volatile("sub.f16x2 %0, %1, %2;\n" : "=r"(h[3]) : "r"(h[3]), "r"(SUB));
asm volatile("sub.f16x2 %0, %1, %2;\n" : "=r"(h[4]) : "r"(h[4]), "r"(SUB));
asm volatile("sub.f16x2 %0, %1, %2;\n" : "=r"(h[5]) : "r"(h[5]), "r"(SUB));
asm volatile("sub.f16x2 %0, %1, %2;\n" : "=r"(h[6]) : "r"(h[6]), "r"(SUB));
asm volatile("sub.f16x2 %0, %1, %2;\n" : "=r"(h[7]) : "r"(h[7]), "r"(SUB));
return result;
}
CUTLASS_DEVICE
result_type operator()(source_type const& s, ScaleComputeT code_scale, ScaleComputeT code_zp)
{
return convert(s, code_scale, code_zp);
}
};
template <>
struct FastInterleavedAndBiasedNumericArrayConverter<bfloat16_t, uint2b_t, 16>
{
using result_type = Array<bfloat16_t, 16>;
using source_type = Array<uint2b_t, 16>;
using ScaleComputeT = float;
using code_type = Array<ScaleComputeT, 4>;
CUTLASS_DEVICE
static result_type convert(source_type const& source, ScaleComputeT code_scale, ScaleComputeT code_zp)
{
uint32_t const i8s = reinterpret_cast<uint32_t const&>(source);
// 2^23 = 8388608
static constexpr uint32_t FP32_BASE = 0x4B000000;
float fp32_intermediates[4];
uint32_t* fp32_intermediates_casted = reinterpret_cast<uint32_t*>(fp32_intermediates);
fp32_intermediates_casted[0] = __byte_perm(i8s, FP32_BASE, 0x7650);
fp32_intermediates_casted[1] = __byte_perm(i8s, FP32_BASE, 0x7651);
fp32_intermediates_casted[2] = __byte_perm(i8s, FP32_BASE, 0x7652);
fp32_intermediates_casted[3] = __byte_perm(i8s, FP32_BASE, 0x7653);
asm volatile("sub.f32 %0, %1, %2;\n" : "=r"(fp32_intermediates_casted[0]) : "r"(fp32_intermediates_casted[0]), "r"(FP32_BASE));
asm volatile("sub.f32 %0, %1, %2;\n" : "=r"(fp32_intermediates_casted[1]) : "r"(fp32_intermediates_casted[1]), "r"(FP32_BASE));
asm volatile("sub.f32 %0, %1, %2;\n" : "=r"(fp32_intermediates_casted[2]) : "r"(fp32_intermediates_casted[2]), "r"(FP32_BASE));
asm volatile("sub.f32 %0, %1, %2;\n" : "=r"(fp32_intermediates_casted[3]) : "r"(fp32_intermediates_casted[3]), "r"(FP32_BASE));
int32_t decode_value[4];
ScaleComputeT new_code_zp = code_zp + 0.5f;
decode_value[0] = __float2int_rd(fmaf(fp32_intermediates[0], code_scale, new_code_zp));
decode_value[1] = __float2int_rd(fmaf(fp32_intermediates[1], code_scale, new_code_zp));
decode_value[2] = __float2int_rd(fmaf(fp32_intermediates[2], code_scale, new_code_zp));
decode_value[3] = __float2int_rd(fmaf(fp32_intermediates[3], code_scale, new_code_zp));
return convert_impl(decode_value);
}
CUTLASS_DEVICE
static result_type convert(source_type const& source, code_type const& code_scale, code_type const& code_zp)
{
uint32_t const i8s = reinterpret_cast<uint32_t const&>(source);
// 2^23 = 8388608
static constexpr uint32_t FP32_BASE = 0x4B000000;
float fp32_intermediates[4];
uint32_t* fp32_intermediates_casted = reinterpret_cast<uint32_t*>(fp32_intermediates);
fp32_intermediates_casted[0] = __byte_perm(i8s, FP32_BASE, 0x7650);
fp32_intermediates_casted[1] = __byte_perm(i8s, FP32_BASE, 0x7651);
fp32_intermediates_casted[2] = __byte_perm(i8s, FP32_BASE, 0x7652);
fp32_intermediates_casted[3] = __byte_perm(i8s, FP32_BASE, 0x7653);
asm volatile("sub.f32 %0, %1, %2;\n" : "=r"(fp32_intermediates_casted[0]) : "r"(fp32_intermediates_casted[0]), "r"(FP32_BASE));
asm volatile("sub.f32 %0, %1, %2;\n" : "=r"(fp32_intermediates_casted[1]) : "r"(fp32_intermediates_casted[1]), "r"(FP32_BASE));
asm volatile("sub.f32 %0, %1, %2;\n" : "=r"(fp32_intermediates_casted[2]) : "r"(fp32_intermediates_casted[2]), "r"(FP32_BASE));
asm volatile("sub.f32 %0, %1, %2;\n" : "=r"(fp32_intermediates_casted[3]) : "r"(fp32_intermediates_casted[3]), "r"(FP32_BASE));
int32_t decode_value[4];
decode_value[0] = __float2int_rd(fmaf(fp32_intermediates[0], code_scale[0], code_zp[0] + 0.5f));
decode_value[1] = __float2int_rd(fmaf(fp32_intermediates[1], code_scale[1], code_zp[1] + 0.5f));
decode_value[2] = __float2int_rd(fmaf(fp32_intermediates[2], code_scale[2], code_zp[2] + 0.5f));
decode_value[3] = __float2int_rd(fmaf(fp32_intermediates[3], code_scale[3], code_zp[3] + 0.5f));
return convert_impl(decode_value);
}
CUTLASS_DEVICE
static result_type convert_impl(int32_t* decode_value)
{
result_type result;
static constexpr uint32_t immLut = (0xF0 & 0xCC) | 0xAA;
static constexpr uint32_t MASK = 0x003F003F;
// 2^7 = 128
static constexpr uint32_t EX = 0x43004300;
uint32_t* h = reinterpret_cast<uint32_t*>(&result);
int32_t q0 = __byte_perm(decode_value[0], decode_value[1], 0x5410);
int32_t q1 = __byte_perm(decode_value[2], decode_value[3], 0x5410);
h[0] = lop3<immLut>(q0 >> 9, MASK, EX);
h[1] = lop3<immLut>(q0 >> 6, MASK, EX);
h[2] = lop3<immLut>(q0 >> 3, MASK, EX);
h[3] = lop3<immLut>(q0, MASK, EX);
h[4] = lop3<immLut>(q1 >> 9, MASK, EX);
h[5] = lop3<immLut>(q1 >> 6, MASK, EX);
h[6] = lop3<immLut>(q1 >> 3, MASK, EX);
h[7] = lop3<immLut>(q1, MASK, EX);
#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) && defined(ENABLE_BF16))
// 128 + 32 = 160
static constexpr uint32_t SUB = 0x43204320;
asm volatile("sub.bf16x2 %0, %1, %2;\n" : "=r"(h[0]) : "r"(h[0]), "r"(SUB));
asm volatile("sub.bf16x2 %0, %1, %2;\n" : "=r"(h[1]) : "r"(h[1]), "r"(SUB));
asm volatile("sub.bf16x2 %0, %1, %2;\n" : "=r"(h[2]) : "r"(h[2]), "r"(SUB));
asm volatile("sub.bf16x2 %0, %1, %2;\n" : "=r"(h[3]) : "r"(h[3]), "r"(SUB));
asm volatile("sub.bf16x2 %0, %1, %2;\n" : "=r"(h[4]) : "r"(h[4]), "r"(SUB));
asm volatile("sub.bf16x2 %0, %1, %2;\n" : "=r"(h[5]) : "r"(h[5]), "r"(SUB));
asm volatile("sub.bf16x2 %0, %1, %2;\n" : "=r"(h[6]) : "r"(h[6]), "r"(SUB));
asm volatile("sub.bf16x2 %0, %1, %2;\n" : "=r"(h[7]) : "r"(h[7]), "r"(SUB));
#else
// 1.0
static constexpr uint32_t MUL = 0x3F803F80;
// -160
static constexpr uint32_t ADD = 0xC320C320;
asm volatile("fma.rn.bf16x2 %0, %1, %2, %3;\n" : "=r"(h[0]) : "r"(h[0]), "r"(MUL), "r"(ADD));
asm volatile("fma.rn.bf16x2 %0, %1, %2, %3;\n" : "=r"(h[1]) : "r"(h[1]), "r"(MUL), "r"(ADD));
asm volatile("fma.rn.bf16x2 %0, %1, %2, %3;\n" : "=r"(h[2]) : "r"(h[2]), "r"(MUL), "r"(ADD));
asm volatile("fma.rn.bf16x2 %0, %1, %2, %3;\n" : "=r"(h[3]) : "r"(h[3]), "r"(MUL), "r"(ADD));
asm volatile("fma.rn.bf16x2 %0, %1, %2, %3;\n" : "=r"(h[4]) : "r"(h[4]), "r"(MUL), "r"(ADD));
asm volatile("fma.rn.bf16x2 %0, %1, %2, %3;\n" : "=r"(h[5]) : "r"(h[5]), "r"(MUL), "r"(ADD));
asm volatile("fma.rn.bf16x2 %0, %1, %2, %3;\n" : "=r"(h[6]) : "r"(h[6]), "r"(MUL), "r"(ADD));
asm volatile("fma.rn.bf16x2 %0, %1, %2, %3;\n" : "=r"(h[7]) : "r"(h[7]), "r"(MUL), "r"(ADD));
#endif
return result;
}
CUTLASS_DEVICE
result_type operator()(source_type const& s, ScaleComputeT code_scale, ScaleComputeT code_zp)
{
return convert(s, code_scale, code_zp);
}
};
template <typename T, int N>
struct FastInterleavedAndBiasedNumericArrayConverter<T, uint2b_t, N>
{
static_assert(platform::is_same<T, half_t>::value || platform::is_same<T, bfloat16_t>::value,
"T must be fp16 or bf16");
static constexpr int kVecWidth = 16;
static_assert(!(N % kVecWidth), "N must be multiple of 16.");
using result_type = Array<T, N>;
using source_type = Array<uint2b_t, N>;
using code_type = Array<float, N / kVecWidth>;
CUTLASS_DEVICE
static result_type convert(source_type const& source, code_type const& code_scale, code_type const& code_zp)
{
using scalar_result_type = typename result_type::Element;
using scalar_source_type = typename source_type::Element;
FastInterleavedAndBiasedNumericArrayConverter<scalar_result_type, scalar_source_type, kVecWidth>
convert_vector_;
result_type result;
using vec_result = Array<scalar_result_type, kVecWidth>;
using vec_source = Array<scalar_source_type, kVecWidth>;
vec_result* result_ptr = reinterpret_cast<vec_result*>(&result);
vec_source const* source_ptr = reinterpret_cast<vec_source const*>(&source);
CUTLASS_PRAGMA_UNROLL
for (int i = 0; i < N / kVecWidth; ++i)
{
result_ptr[i] = convert_vector_(source_ptr[i], code_scale[i], code_zp[i]);
}
return result;
}
CUTLASS_DEVICE
static result_type convert(source_type const& source, Array<float, N / 4> const& code_scale, Array<float, N / 4> const& code_zp)
{
using scalar_result_type = typename result_type::Element;
using scalar_source_type = typename source_type::Element;
using Converter = FastInterleavedAndBiasedNumericArrayConverter<scalar_result_type, scalar_source_type, kVecWidth>;
result_type result;
using vec_result = typename Converter::result_type;
using vec_source = typename Converter::source_type;
using vec_code = typename Converter::code_type;
vec_result* result_ptr = reinterpret_cast<vec_result*>(&result);
vec_source const* source_ptr = reinterpret_cast<vec_source const*>(&source);
vec_code const* code_scale_ptr = reinterpret_cast<vec_code const*>(&code_scale);
vec_code const* code_zp_ptr = reinterpret_cast<vec_code const*>(&code_zp);
CUTLASS_PRAGMA_UNROLL
for (int i = 0; i < N / kVecWidth; ++i)
{
result_ptr[i] = Converter::convert(source_ptr[i], code_scale_ptr[i], code_zp_ptr[i]);
}
return result;
}
CUTLASS_DEVICE
result_type operator()(source_type const& s, code_type const& code_scale, code_type const& code_zp)
{
return convert(s, code_scale, code_zp);
}
};
/////////////////////////////////////////////////////////////////////////////////////////////////
} // namespace cutlass
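The fp16 and bf16 specializations above hide the decode arithmetic behind PTX byte_perm/lop3/sub tricks. As a reading aid, here is a minimal host-side C++ sketch derived only from the constants in those kernels (the function name and the ordering notes are mine, not part of the source): each packed byte b is mapped to floor(b * code_scale + code_zp + 0.5), and the 16 outputs are 6-bit fields of the resulting integers, taken at bit offsets 9, 6, 3 and 0 of each low 16-bit half, minus 32. The fp16 and bf16 paths differ only in the exponent constants (0x6400/0x6420 versus 0x4300/0x4320); both reduce to the same field - 32.
// decode_wint2_reference.cpp -- hedged host-side restatement of the arithmetic in
// FastInterleavedAndBiasedNumericArrayConverter<*, uint2b_t, 16>; illustrative only.
#include <cmath>
#include <cstdint>
#include <vector>
// Decode one packed 16-element uint2b group (4 bytes) into 16 float values.
std::vector<float> decode_wint2_group_reference(uint32_t packed_bytes,
                                                float code_scale,
                                                float code_zp) {
    // Steps 1+2: each byte becomes a float (byte_perm with FP32_BASE, then sub)
    // and goes through the affine code transform; __float2int_rd with the
    // +0.5f bias is floor(x + 0.5).
    int32_t decode_value[4];
    for (int j = 0; j < 4; ++j) {
        float byte_val = static_cast<float>((packed_bytes >> (8 * j)) & 0xFF);
        decode_value[j] = static_cast<int32_t>(
            std::floor(std::fma(byte_val, code_scale, code_zp + 0.5f)));
    }
    // Step 3: the kernel pairs the low 16 bits of decode_value[0..3] (byte_perm
    // 0x5410), extracts 6-bit fields at bit offsets 9/6/3/0 (lop3 with MASK and
    // the exponent EX), and subtracts the bias SUB -- equivalent to field - 32.
    std::vector<float> out;
    out.reserve(16);
    const int shifts[4] = {9, 6, 3, 0};
    for (int pair = 0; pair < 2; ++pair) {          // (dv0, dv1) then (dv2, dv3)
        for (int s = 0; s < 4; ++s) {
            for (int half = 0; half < 2; ++half) {  // low/high lane of each h[k]
                int v = decode_value[pair * 2 + half] & 0xFFFF;
                int field = (v >> shifts[s]) & 0x3F;
                out.push_back(static_cast<float>(field - 32));
            }
        }
    }
    return out;  // 16 values, in the interleaved order the kernel writes them
}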

View File

@@ -125,10 +125,13 @@ struct WintQuantTraits<ElementT, WintQuantMethod::kWeightOnlyInt2> {
static constexpr int32_t kNumPackedValues = 4;
static constexpr int32_t kPackedSize = 16;
using LocalScaleType = uint4b_t;
using CodeScaleZpType = float;
struct Arguments {
const uint8_t *local_scale_ptr; // quanted 4-bits
const float *code_scale_ptr;
const float *code_zp_ptr;
uint8_t *local_scale_ptr; // quanted 4-bits
float *code_scale_ptr;
float *code_zp_ptr;
};
CUTLASS_DEVICE

View File

@@ -43,7 +43,6 @@
#include "cutlass/trace.h"
#include "cutlass_extensions/gemm/kernel/gemm_moe_problem_visitor.h"
#include "cutlass_extensions/gemm/threadblock/wint2x_tile_dequanter.h"
#include "cutlass_extensions/tile_interleaved_layout.h"
/////////////////////////////////////////////////////////////////////////////////////////////////
@@ -775,17 +774,54 @@ struct Wint2xMoeFCGemm : public MoeFCGemm<Mma_, Epilogue_, ThreadblockSwizzle_,
template <WintQuantMethod QuantMethod, typename dummy>
struct KernelRunner<QuantMethod, true, dummy> {
using WeightQuantTraits = WintQuantTraits<ElementA, QuantMethod>;
using QuantArguments = typename WeightQuantTraits::Arguments;
using MmaQuantArguments = typename Mma::QuantParamsAccessor::Arguments;
CUTLASS_DEVICE
static QuantArguments get_quant_args(Params const& params, int32_t problem_idx, const int64_t gemm_k, const int64_t gemm_n) {
QuantArguments quant_args;
if constexpr (QuantMethod == WintQuantMethod::kWeightOnlyInt2) {
quant_args.local_scale_ptr = params.local_scale + problem_idx * gemm_k * gemm_n / 128;
quant_args.code_scale_ptr = params.code_scale + problem_idx * gemm_n;
quant_args.code_zp_ptr = params.code_zp + problem_idx * gemm_n;
}
return quant_args;
static MmaQuantArguments prepare_quant_args(
Params const& params, cutlass::gemm::GemmCoord const& threadblock_offset,
int64_t problem_idx, const int32_t gemm_k, const int32_t gemm_n, const int thread_idx) {
// the starting threadblock offset of the scale, which shares its column id with C but has no row id
cutlass::MatrixCoord tb_offset_scale{0, threadblock_offset.n()};
cutlass::MatrixCoord tb_offset_local_scale{0, threadblock_offset.n() * 2};
ElementScale* weight_scale_ptr = params.weight_scales + problem_idx * gemm_n;
typename Mma::QuantParamsAccessor::IteratorSuperScale iterator_super_scale(
Mma::QuantParamsAccessor::LayoutSuperScale(gemm_n),
weight_scale_ptr,
{1, gemm_n},
thread_idx,
tb_offset_scale);
int local_scale_pointer_offset = ((ThreadblockShape::kK + 127) / 128) * (gemm_n * 2);
int64_t offset_in_bytes = problem_idx * gemm_k * gemm_n / 128;
uint4b_t *local_scale_ptr = reinterpret_cast<uint4b_t *>(params.local_scale + offset_in_bytes);
typename Mma::QuantParamsAccessor::IteratorLocalScale iterator_local_scale(
Mma::QuantParamsAccessor::LayoutLocalScale(gemm_n * 2),
local_scale_ptr,
{(gemm_k + 127) / 128, gemm_n * 2},
thread_idx,
tb_offset_local_scale);
float* code_scale_ptr = params.code_scale + problem_idx * gemm_n;
typename Mma::QuantParamsAccessor::IteratorCodeScaleZp iterator_code_scale(
Mma::QuantParamsAccessor::LayoutCodeScaleZp(gemm_n),
code_scale_ptr,
{1, gemm_n},
thread_idx,
tb_offset_scale);
float* code_zp_ptr = params.code_zp + problem_idx * gemm_n;
typename Mma::QuantParamsAccessor::IteratorCodeScaleZp iterator_code_zp(
Mma::QuantParamsAccessor::LayoutCodeScaleZp(gemm_n),
code_zp_ptr,
{1, gemm_n},
thread_idx,
tb_offset_scale);
MmaQuantArguments mma_quant_args(
iterator_super_scale, iterator_local_scale, iterator_code_scale, iterator_code_zp, local_scale_pointer_offset);
return mma_quant_args;
}
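For reference, the per-expert pointer arithmetic that prepare_quant_args applies before constructing the four iterators can be restated in a few lines of plain C++. The helper and field names below are illustrative; reading the /128 factor as one byte of packed 4-bit local scales per 128-element k-group is an inference from the iterator extents above, not something the source states.
#include <cstdint>
// Hedged sketch of the per-problem (per-expert) offsets used above.
struct QuantPtrOffsets {
    int64_t local_scale_bytes;  // byte offset into params.local_scale
    int64_t code_scale_elems;   // element offset into params.code_scale
    int64_t code_zp_elems;      // element offset into params.code_zp
    int     local_scale_rows;   // row extent handed to IteratorLocalScale
};
inline QuantPtrOffsets quant_ptr_offsets(int64_t problem_idx, int64_t gemm_k, int64_t gemm_n) {
    QuantPtrOffsets o;
    o.local_scale_bytes = problem_idx * gemm_k * gemm_n / 128;      // packed 4-bit local scales
    o.code_scale_elems  = problem_idx * gemm_n;                     // one float per output column
    o.code_zp_elems     = problem_idx * gemm_n;
    o.local_scale_rows  = static_cast<int>((gemm_k + 127) / 128);   // one row per 128-wide k-group
    return o;
}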
CUTLASS_DEVICE
@@ -814,9 +850,6 @@ struct Wint2xMoeFCGemm : public MoeFCGemm<Mma_, Epilogue_, ThreadblockSwizzle_,
kInterleave >= 1,
"B must be row major/col major OR col major interleaved.");
// LayoutB should be RowMajor
using TileDequanterB = cutlass::gemm::threadblock::TileDequanter<ElementA, ElementScale, ThreadblockShape::kK, ThreadblockShape::kN, kStages, kThreadCount, QuantMethod>;
//
// Problem visitor.
//
@@ -843,12 +876,6 @@ struct Wint2xMoeFCGemm : public MoeFCGemm<Mma_, Epilogue_, ThreadblockSwizzle_,
int(cta_idx % grid_shape.n()) * Mma::Shape::kN, // NOLINT
0);
// begin address offset for weight_scale.
ElementScale* weight_scale_ptr =
params.weight_scales ? params.weight_scales + problem_idx * problem_size.n() : nullptr;
// the begin threadblock_offset of scale, which holds the same column id with C, but with no row id
cutlass::MatrixCoord tb_offset_scale{0, threadblock_offset.n()};
// Load element pointers. Exchange pointers and strides if working on
// the transpose
int64_t rows_to_jump = 0;
@@ -866,42 +893,20 @@ struct Wint2xMoeFCGemm : public MoeFCGemm<Mma_, Epilogue_, ThreadblockSwizzle_,
// Compute initial location in logical coordinates
// the begin threadblock_offset of A, which holds the same row id with C
cutlass::MatrixCoord tb_offset_A{
threadblock_offset.m(),
0,
};
cutlass::MatrixCoord tb_offset_A{threadblock_offset.m(), 0};
// begin address offset for B for current problem_idx, totally num_experts problems
char* byte_ptr_B = ((char*)params.ptr_B) + // NOLINT
problem_idx * bytes_per_expert_matrix; // NOLINT
ElementB* ptr_B = reinterpret_cast<ElementB*>(byte_ptr_B);
typename LayoutB::LongIndex ldm_B =
platform::is_same<layout::RowMajor, LayoutB>::value
? gemm_n
: gemm_k * kInterleave;
typename LayoutB::LongIndex ldm_B_shared = TileDequanterB::kColumns;
// the begin threadblock_offset of B, which holds the same column id with C
cutlass::MatrixCoord tb_offset_B{0,
threadblock_offset.n() / kInterleave};
cutlass::MatrixCoord tb_offset_B{0, threadblock_offset.n() / kInterleave};
cutlass::MatrixCoord extent_B{problem_size.k() * kInterleave, problem_size.n() / kInterleave};
cutlass::MatrixCoord extent_B_shared{TileDequanterB::kRows, TileDequanterB::kColumns};
MmaElementB* smem_unzip_B_ptr = nullptr;
if constexpr (QuantMethod == WintQuantMethod::kWeightOnlyInt2) {
smem_unzip_B_ptr = shared_storage.main_loop.operand_unzip_B_ptr();
}
QuantArguments quant_args = get_quant_args(params, problem_idx, gemm_k, gemm_n);
TileDequanterB tile_dequanter_B(smem_unzip_B_ptr,
byte_ptr_B,
ldm_B,
extent_B,
tb_offset_B,
weight_scale_ptr,
tb_offset_scale,
quant_args);
MmaElementB* ptr_B = tile_dequanter_B.GetOutPtr();
// Compute position within threadblock
int thread_idx = threadIdx.x;
@@ -914,20 +919,21 @@ struct Wint2xMoeFCGemm : public MoeFCGemm<Mma_, Epilogue_, ThreadblockSwizzle_,
tb_offset_A);
typename Mma::IteratorB iterator_B(
LayoutB(TileDequanterB::kUseSharedMemory ? ldm_B_shared : ldm_B),
LayoutB(ldm_B),
ptr_B,
TileDequanterB::kUseSharedMemory ? extent_B_shared : extent_B,
extent_B,
thread_idx,
TileDequanterB::kUseSharedMemory ? cutlass::make_Coord(0, 0) : tb_offset_B);
tb_offset_B);
MmaQuantArguments mma_quant_args = prepare_quant_args(
params, threadblock_offset, problem_idx, gemm_k, gemm_n, thread_idx);
typename Mma::FragmentC accumulators;
accumulators.clear();
// Broadcast the warp_id computed by lane 0 to ensure dependent code
// is compiled as warp-uniform.
int warp_idx = __shfl_sync(0xffffffff, threadIdx.x / 32, 0);
int lane_idx = threadIdx.x % 32;
//
@@ -950,7 +956,7 @@ struct Wint2xMoeFCGemm : public MoeFCGemm<Mma_, Epilogue_, ThreadblockSwizzle_,
accumulators,
iterator_A,
iterator_B,
tile_dequanter_B,
mma_quant_args,
accumulators);
//

View File

@@ -205,7 +205,7 @@ void generic_moe_gemm_kernelLauncher(const T* A,
threadblock_count,
epilogue_op,
reinterpret_cast<const ElementType*>(A),
reinterpret_cast<const CutlassMmaWeightType*>(B),
reinterpret_cast<const CutlassMmaKernelType*>(B),
reinterpret_cast<const ElementType*>(weight_scales),
reinterpret_cast<const ElementType*>(biases),
reinterpret_cast<ElementType*>(C),

View File

@@ -223,14 +223,11 @@ public:
static Status can_implement(Arguments const &args)
{
CUTLASS_TRACE_HOST("W4A8MoeGemmUniversalBase::can_implement()");
// printf("--1\n");
// Initialize static kernel and device properties, if necessary.
Status result = init_device_props();
// printf("--1-2\n");
if (result != Status::kSuccess) {
return result;
}
// printf("--2\n");
dim3 grid = get_grid_shape(args);
// printf("--grid:%d, %d, %d\n", grid.x, grid.y, grid.z);
if (!(grid.y <= std::numeric_limits<uint16_t>::max() &&
@@ -238,7 +235,6 @@ public:
{
return Status::kErrorInvalidProblem;
}
// printf("--3\n");
return GemmKernel::can_implement(args);
}
@@ -285,18 +281,50 @@ public:
}
/// Returns the maximum number of active thread blocks per multiprocessor
static int maximum_active_blocks()
static int maximum_active_blocks(int smem_capacity = -1)
{
CUTLASS_TRACE_HOST("W4A8MoeGemmUniversalBase::maximum_active_blocks()");
// Initialize static device properties, if necessary
if (init_device_props() != Status::kSuccess) {
int smem_size = int(sizeof(typename GemmKernel_::SharedStorage));
CUTLASS_TRACE_HOST(" smem_size: " << smem_size << " bytes");
cudaError_t result;
if (smem_size > (48 << 10)) {
result = cudaFuncSetAttribute(Kernel2<GemmKernel_>,
cudaFuncAttributeMaxDynamicSharedMemorySize,
smem_size);
if (result != cudaSuccess) {
// Call cudaGetLastError() to clear the error bit
result = cudaGetLastError();
CUTLASS_TRACE_HOST(
" cudaFuncSetAttribute() returned error "
<< cudaGetErrorString(result));
return -1;
}
}
int max_active_blocks = -1;
result = cudaOccupancyMaxActiveBlocksPerMultiprocessor(
&max_active_blocks,
Kernel2<GemmKernel_>,
GemmKernel_::kThreadCount,
smem_size);
if (result != cudaSuccess) {
// Call cudaGetLastError() to clear the error bit
result = cudaGetLastError();
CUTLASS_TRACE_HOST(
" cudaOccupancyMaxActiveBlocksPerMultiprocessor() returned error "
<< cudaGetErrorString(result));
return -1;
}
CUTLASS_TRACE_HOST(" max_active_blocks: " << sm_occupancy_);
return sm_occupancy_;
CUTLASS_TRACE_HOST(" max_active_blocks: " << max_active_blocks);
return max_active_blocks;
}
@@ -341,8 +369,7 @@ public:
// Configure grid and block dimensions
dim3 block(GemmKernel::kThreadCount, 1, 1);
// dim3 grid = params_.get_grid_dims();
dim3 grid(216, 1, 1);
dim3 grid(params_.threadblock_count, 1, 1);
// Launch kernel
CUTLASS_TRACE_HOST(" "

View File

@@ -21,12 +21,12 @@ rm -rf up_gate_proj_7168_8192.log
rm -rf down_proj_8192_3584.log
num_experts=8
for tokens_per_expert in 12
for tokens_per_expert in 1 2 4 8 16 20 24 28 32 36 48 64 96 128 160 192 224 256 384 512 768 1024 2048 3072 4096 8192
do
wait
CUDA_VISIBLE_DEVICES=2 ./w4a8_moe_gemm_test ${num_experts} ${up_gate_proj_n} ${up_gate_proj_k} ${tokens_per_expert} 1 0 >> up_gate_proj_${up_gate_proj_n}_${up_gate_proj_k}.log 2>&1 &
# CUDA_VISIBLE_DEVICES=3 ./w4a8_moe_gemm_test ${num_experts} ${down_proj_n} ${down_proj_k} ${tokens_per_expert} 1 0 >> down_proj_${down_proj_n}_${down_proj_k}.log 2>&1 &
CUDA_VISIBLE_DEVICES=2 ./w4a8_moe_gemm_test ${num_experts} ${ffn1_n} ${ffn1_k} ${tokens_per_expert} 0 1 >> ffn1_${ffn1_n}_${ffn1_k}.log 2>&1 &
CUDA_VISIBLE_DEVICES=3 ./w4a8_moe_gemm_test ${num_experts} ${ffn2_n} ${ffn2_k} ${tokens_per_expert} 0 1 >> ffn2_${ffn2_n}_${ffn2_k}.log 2>&1 &
done
wait
echo "#### finish ####"

View File

@@ -996,7 +996,6 @@ int main(int argc, char *argv[]) {
CutlassTileConfig::CtaShape64x256x64_WarpShape64x64x64,
CutlassTileConfig::CtaShape32x512x64_WarpShape32x128x64,
CutlassTileConfig::CtaShape128x128x64_WarpShape128x32x64,
CutlassTileConfig::CtaShape32x512x64_WarpShape32x128x64,
};
std::vector<SplitKStyle> all_split_k_style{SplitKStyle::NO_SPLIT_K};

View File

@@ -0,0 +1,60 @@
// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/extension.h"
std::vector<paddle::Tensor> GetImgBoundaries(const paddle::Tensor& task_input_ids,
const paddle::Tensor& grid_thw,
const int64_t image_patch_id) {
// All tensors are on CPU
auto input_ids_ptr = task_input_ids.data<int64_t>();
int64_t seq_lens_origin = task_input_ids.numel();
auto grid_thw_ptr = grid_thw.data<int64_t>();
int token_times = 4;
int token_idx = 0;
int image_idx = 0;
std::vector<int> img_boundaries, img_nums;
img_boundaries.emplace_back(0);
img_nums.emplace_back(0);
while (token_idx < seq_lens_origin) {
if (input_ids_ptr[token_idx] != image_patch_id) {
do {
token_idx++;
} while (token_idx < seq_lens_origin && input_ids_ptr[token_idx] != image_patch_id);
} else {
int cur_image_token_len = (grid_thw_ptr[image_idx * 3 + 1] * grid_thw_ptr[image_idx * 3 + 2]) / token_times;
image_idx++;
token_idx += cur_image_token_len;
}
img_boundaries.emplace_back(token_idx);
img_nums.emplace_back(image_idx);
}
int64_t num_img_boundaries = static_cast<int64_t>(img_boundaries.size());
auto out = paddle::full({2, num_img_boundaries}, 0, paddle::DataType::INT64, paddle::CPUPlace());
for (int i = 0; i < num_img_boundaries; i++) {
out.data<int64_t>()[i] = img_boundaries[i];
out.data<int64_t>()[num_img_boundaries + i] = img_nums[i];
}
return {out};
}
PD_BUILD_OP(get_img_boundaries)
.Inputs({"task_input_ids", "grid_thw"})
.Attrs({"image_patch_id: int64_t"})
.Outputs({"img_boundaries"})
.SetKernelFn(PD_KERNEL(GetImgBoundaries));
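To make the boundary scan concrete, the standalone program below reruns the same loop on a toy token sequence without any Paddle dependency; the token ids, the image_patch_id value, and the expected output in the comments are invented for illustration.
// Hedged, Paddle-free walkthrough of the scan in GetImgBoundaries.
#include <cstdint>
#include <iostream>
#include <vector>
int main() {
    const int64_t image_patch_id = 9;  // hypothetical placeholder id
    // 3 text tokens, one image with grid_thw = (1, 4, 4) -> 4 * 4 / 4 = 4 patch tokens,
    // then 2 more text tokens.
    std::vector<int64_t> input_ids = {1, 2, 3, 9, 9, 9, 9, 5, 6};
    std::vector<int64_t> grid_thw  = {1, 4, 4};
    const int token_times = 4;  // same merge factor as in the op above
    std::vector<int> boundaries{0}, img_nums{0};
    int token_idx = 0, image_idx = 0;
    const int64_t n = static_cast<int64_t>(input_ids.size());
    while (token_idx < n) {
        if (input_ids[token_idx] != image_patch_id) {
            // skip over a run of text tokens
            do { ++token_idx; } while (token_idx < n && input_ids[token_idx] != image_patch_id);
        } else {
            // jump over all patch tokens of the current image
            int cur = static_cast<int>(grid_thw[image_idx * 3 + 1] * grid_thw[image_idx * 3 + 2]) / token_times;
            ++image_idx;
            token_idx += cur;
        }
        boundaries.push_back(token_idx);
        img_nums.push_back(image_idx);
    }
    // Expected: boundaries = 0 3 7 9, img_nums = 0 0 1 1
    for (int b : boundaries) std::cout << b << ' ';
    std::cout << '\n';
    for (int m : img_nums) std::cout << m << ' ';
    std::cout << '\n';
    return 0;
}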

View File

@@ -665,10 +665,139 @@ void moe_fast_hardamard_kernel(const T *x,
}
}
template <typename T, typename OutT, int kThreads, int kNBytes, int VecSize, int N,
int kNChunks, int kSmemSize, int kRounds, int kChunksPerSmemSize, bool UseDiagonalBlockMatrix = false>
__global__ __launch_bounds__(kThreads)
void masked_moe_fast_hardamard_kernel(const T *x,
const int64_t *recv_expert_count,
const T *shift,
const T *smooth,
const float* quant_scales,
const int quant_round_type,
const float quant_max_bound,
const float quant_min_bound,
const int64_t token_num,
const int64_t dim,
const int num_max_tokens_per_expert,
OutT *out) {
using vec_t = typename BytesToType<sizeof(T) * VecSize>::Type;
constexpr int kLogVecSize = cilog2(VecSize);
constexpr int kLogWarpSize = cilog2(32);
constexpr int kWarpSize = 32;
constexpr int kNWarps = kThreads / kWarpSize;
constexpr int kLogNWarps = cilog2(kNWarps);
constexpr int kLogNChunks = cilog2(kNChunks);
extern __shared__ char smem_[];
vec_t *smem_exchange = reinterpret_cast<vec_t *>(smem_);
for (int token_id = blockIdx.x; token_id < token_num; token_id += gridDim.x) {
const auto token_idx_in_expert = token_id % num_max_tokens_per_expert;
const auto expert_id = token_id / num_max_tokens_per_expert;
if (token_idx_in_expert >= recv_expert_count[expert_id]) {
auto next_expert_start_idx = (expert_id + 1) * num_max_tokens_per_expert;
auto num_iters_to_next_expert = (next_expert_start_idx - token_id - 1) / gridDim.x;
token_id += num_iters_to_next_expert * gridDim.x;
continue;
}
const T *x_now = x + token_id * dim;
OutT *out_now = out + token_id * dim;
T init_value = static_cast<T>(0.f);
T x_vals[kNChunks][VecSize] = {init_value};
load_input<kNChunks, VecSize, UseDiagonalBlockMatrix, T>(x_now, x_vals, dim);
#ifdef DEBUG_HARDAMARD
if (blockIdx.x == 0 && threadIdx.x == 0) {
for (int i = 0; i < 1; ++i) {
printf("chunk_id0: %d\n", i);
for (int j = 0; j < VecSize; ++j) {
printf("%f ", (float)x_vals[i][j]);
}
printf("\n");
}
}
__syncthreads();
#endif
hadamard_mult_thread<kLogVecSize, kNChunks>(x_vals);
#ifdef DEBUG_HARDAMARD
if (blockIdx.x == 0 && threadIdx.x == 0) {
for (int i = 0; i < 1; ++i) {
printf("chunk_id1: %d, kLogVecSize: %d\n", i, kLogVecSize);
for (int j = 0; j < VecSize; ++j) {
printf("%f ", (float)x_vals[i][j]);
}
printf("\n");
}
}
__syncthreads();
#endif
hadamard_mult_warp<kLogWarpSize, 0, kNChunks, VecSize>(x_vals);
#ifdef DEBUG_HARDAMARD
if (blockIdx.x == 0 && threadIdx.x == 0) {
for (int i = 0; i < 1; ++i) {
printf("chunk_id2: %d\n", i);
for (int j = 0; j < VecSize; ++j) {
printf("%f ", (float)x_vals[i][j]);
}
printf("\n");
}
}
__syncthreads();
#endif
if constexpr (kNWarps > 1) {
// First, let kNWarps consecutive threads gather the data held by the other warps
exchange_smem_pre<kNChunks, kChunksPerSmemSize, VecSize, kWarpSize, kNWarps, true, vec_t>(x_vals, smem_exchange);
// Do the cross-warp stage of the transform
hadamard_mult_warp<kLogNWarps, 0, kNChunks, VecSize>(x_vals);
// Swap the data back to its original layout
exchange_smem_pre<kNChunks, kChunksPerSmemSize, VecSize, kWarpSize, kNWarps, false, vec_t>(x_vals, smem_exchange);
}
if constexpr (kNChunks > 1) {
if constexpr (kNChunks == 28) {
hadamard_mult_thread_28_transpose<T, VecSize>(x_vals);
} else if constexpr (kNChunks == 36) {
hadamard_mult_thread_36_transpose<T, VecSize>(x_vals);
} else {
constexpr int kLogNChunks = cilog2(kNChunks);
static_assert(1 << kLogNChunks == kNChunks, "kNChunks must be a power of 2");
hadamard_mult_thread_transpose<kLogNChunks, VecSize>(x_vals);
}
}
if (quant_scales) {
float quant_scale = quant_scales[expert_id];
if (shift) {
smooth_quant_store_output<kNChunks, VecSize, UseDiagonalBlockMatrix, T, OutT>(
out_now,
shift,
smooth,
x_vals,
quant_scale,
quant_round_type,
quant_max_bound,
quant_min_bound,
dim);
} else {
quant_store_output<kNChunks, VecSize, UseDiagonalBlockMatrix, T, OutT>(
out_now,
x_vals,
quant_scale,
quant_round_type,
quant_max_bound,
quant_min_bound,
dim);
}
} else {
store_output<kNChunks, VecSize, UseDiagonalBlockMatrix, T>(out_now, x_vals, dim);
}
}
}
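The only subtle part of masked_moe_fast_hardamard_kernel is the grid-stride loop over padded per-expert slots. The host-side helper below (names are mine) restates that indexing: token ids are expert_id * num_max_tokens_per_expert + slot, slots at or beyond recv_expert_count[expert_id] are padding, and the kernel fast-forwards past them so the next increment lands in the following expert's region.
// Hedged sketch of the scheduling arithmetic used by the masked kernel above.
#include <cstdint>
#include <vector>
// Returns the token ids one block (stride grid_dim_x, starting at block_idx)
// would actually process, skipping padded slots the same way the kernel does.
std::vector<int64_t> masked_schedule(int64_t token_num,
                                     int num_max_tokens_per_expert,
                                     const std::vector<int64_t>& recv_expert_count,
                                     int64_t block_idx, int64_t grid_dim_x) {
    std::vector<int64_t> processed;
    for (int64_t token_id = block_idx; token_id < token_num; token_id += grid_dim_x) {
        const int64_t token_idx_in_expert = token_id % num_max_tokens_per_expert;
        const int64_t expert_id = token_id / num_max_tokens_per_expert;
        if (token_idx_in_expert >= recv_expert_count[expert_id]) {
            // Padded slot: advance token_id so the next loop increment lands at or
            // after the start of the next expert's region.
            const int64_t next_expert_start = (expert_id + 1) * num_max_tokens_per_expert;
            const int64_t iters_to_skip = (next_expert_start - token_id - 1) / grid_dim_x;
            token_id += iters_to_skip * grid_dim_x;
            continue;
        }
        processed.push_back(token_id);  // the kernel transforms x[token_id * dim .. +dim) here
    }
    return processed;
}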
template <typename T, typename OutT, int kLogN, int VecSize, int kNChunks, int kThreads, bool UseDiagonalBlockMatrix>
void MoeFastHardamardImplWrapper(const T *x,
const int64_t *expert_idx_per_token,
const int64_t *recv_expert_count,
const T *shift,
const T *smooth,
const float* quant_scales,
@@ -677,6 +806,8 @@ void MoeFastHardamardImplWrapper(const T *x,
const float quant_min_bound,
const int64_t token_num,
const int64_t dim,
const int num_max_tokens_per_expert,
bool used_in_ep_low_latency,
OutT* out,
cudaStream_t stream) {
using nv_type = typename nv_type_traits<T>::type;
@@ -696,34 +827,61 @@ void MoeFastHardamardImplWrapper(const T *x,
int sm_count;
int act_blocks_per_sm;
cudaDeviceGetAttribute(&sm_count, cudaDevAttrMultiProcessorCount, dev_id);
auto kernel = moe_fast_hardamard_kernel<nv_type, out_type, kThreads, kNBytes, VecSize, N, kNChunks, kSmemSize, kRounds, kChunksPerSmemSize, UseDiagonalBlockMatrix>;
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
&act_blocks_per_sm, kernel, kThreads, kSmemSize);
const int num_blocks_per_wave = sm_count * act_blocks_per_sm;
dim3 grid;
grid.x = min(static_cast<int64_t>(num_blocks_per_wave), token_num);
if constexpr (UseDiagonalBlockMatrix) {
grid.y = ceil(dim / (kThreads * VecSize));
if (used_in_ep_low_latency) {
auto masked_kernel = masked_moe_fast_hardamard_kernel<nv_type, out_type, kThreads, kNBytes, VecSize, N, kNChunks, kSmemSize, kRounds, kChunksPerSmemSize, UseDiagonalBlockMatrix>;
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
&act_blocks_per_sm, masked_kernel, kThreads, kSmemSize);
const int num_blocks_per_wave = sm_count * act_blocks_per_sm;
dim3 grid;
grid.x = min(static_cast<int64_t>(num_blocks_per_wave), token_num);
if constexpr (UseDiagonalBlockMatrix) {
grid.y = ceil(dim / (kThreads * VecSize));
}
masked_kernel<<<grid, kThreads, kSmemSize, stream>>>(
reinterpret_cast<const nv_type*>(x),
recv_expert_count,
reinterpret_cast<const nv_type*>(shift),
reinterpret_cast<const nv_type*>(smooth),
quant_scales,
quant_round_type,
quant_max_bound,
quant_min_bound,
token_num,
dim,
num_max_tokens_per_expert,
reinterpret_cast<out_type*>(out)
);
} else {
auto kernel = moe_fast_hardamard_kernel<nv_type, out_type, kThreads, kNBytes, VecSize, N, kNChunks, kSmemSize, kRounds, kChunksPerSmemSize, UseDiagonalBlockMatrix>;
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
&act_blocks_per_sm, kernel, kThreads, kSmemSize);
const int num_blocks_per_wave = sm_count * act_blocks_per_sm;
dim3 grid;
grid.x = min(static_cast<int64_t>(num_blocks_per_wave), token_num);
if constexpr (UseDiagonalBlockMatrix) {
grid.y = ceil(dim / (kThreads * VecSize));
}
kernel<<<grid, kThreads, kSmemSize, stream>>>(
reinterpret_cast<const nv_type*>(x),
expert_idx_per_token,
reinterpret_cast<const nv_type*>(shift),
reinterpret_cast<const nv_type*>(smooth),
quant_scales,
quant_round_type,
quant_max_bound,
quant_min_bound,
token_num,
dim,
reinterpret_cast<out_type*>(out)
);
}
kernel<<<grid, kThreads, kSmemSize, stream>>>(
reinterpret_cast<const nv_type*>(x),
expert_idx_per_token,
reinterpret_cast<const nv_type*>(shift),
reinterpret_cast<const nv_type*>(smooth),
quant_scales,
quant_round_type,
quant_max_bound,
quant_min_bound,
token_num,
dim,
reinterpret_cast<out_type*>(out)
);
CUDA_CHECK(cudaDeviceSynchronize());
}
template <typename T, typename OutT>
void MoeFastHardamardWrapper(const T *x_data,
const int64_t *expert_idx_per_token,
const int64_t *recv_expert_count,
const T *shift,
const T *smooth,
const float* quant_scales,
@@ -732,6 +890,8 @@ void MoeFastHardamardWrapper(const T *x_data,
const float quant_min_bound,
const int64_t token_num,
const int64_t dim,
const int num_max_tokens_per_expert,
bool used_in_ep_low_latency,
OutT* out,
cudaStream_t &stream) {
bool FLAGS_hardamard_use_diagonal_block_matrix = true;
@@ -749,6 +909,7 @@ void MoeFastHardamardWrapper(const T *x_data,
MoeFastHardamardImplWrapper<T, OutT, kLogN, VEC_SIZE, kNChunks, kThreads, true>(
x_data,
expert_idx_per_token,
recv_expert_count,
shift,
smooth,
quant_scales,
@@ -757,6 +918,8 @@ void MoeFastHardamardWrapper(const T *x_data,
quant_min_bound,
token_num,
dim,
num_max_tokens_per_expert,
used_in_ep_low_latency,
out,
stream);
})});
@@ -770,6 +933,7 @@ void MoeFastHardamardWrapper(const T *x_data,
MoeFastHardamardImplWrapper<T, OutT, kLogN, VecSize, kNChunks, kThreads, false>(
x_data,
expert_idx_per_token,
recv_expert_count,
shift,
smooth,
quant_scales,
@@ -778,6 +942,8 @@ void MoeFastHardamardWrapper(const T *x_data,
quant_min_bound,
token_num,
dim,
num_max_tokens_per_expert,
used_in_ep_low_latency,
out,
stream);
});
@@ -790,6 +956,7 @@ void MoeFastHardamardWrapper(const T *x_data,
MoeFastHardamardImplWrapper<T, OutT, kLogN, VecSize, kNChunks, kThreads, false>(
x_data,
expert_idx_per_token,
recv_expert_count,
shift,
smooth,
quant_scales,
@@ -798,6 +965,8 @@ void MoeFastHardamardWrapper(const T *x_data,
quant_min_bound,
token_num,
dim,
num_max_tokens_per_expert,
used_in_ep_low_latency,
out,
stream);
});
@@ -810,6 +979,7 @@ void MoeFastHardamardWrapper(const T *x_data,
MoeFastHardamardImplWrapper<T, OutT, kLogN, VecSize, kNChunks, kThreads, false>(
x_data,
expert_idx_per_token,
recv_expert_count,
shift,
smooth,
quant_scales,
@@ -818,6 +988,8 @@ void MoeFastHardamardWrapper(const T *x_data,
quant_min_bound,
token_num,
dim,
num_max_tokens_per_expert,
used_in_ep_low_latency,
out,
stream);
});
@@ -828,6 +1000,7 @@ void MoeFastHardamardWrapper(const T *x_data,
template void MoeFastHardamardWrapper<phi::dtype::float16, phi::dtype::float16>(
const phi::dtype::float16 *x_data,
const int64_t *expert_idx_per_token,
const int64_t *recv_expert_count,
const phi::dtype::float16 *shift,
const phi::dtype::float16 *smooth,
const float* quant_scales,
@@ -836,6 +1009,8 @@ template void MoeFastHardamardWrapper<phi::dtype::float16, phi::dtype::float16>(
const float quant_min_bound,
const int64_t token_num,
const int64_t dim,
const int num_max_tokens_per_expert,
bool used_in_ep_low_latency,
phi::dtype::float16 *out,
cudaStream_t &stream
);
@@ -843,6 +1018,7 @@ template void MoeFastHardamardWrapper<phi::dtype::float16, phi::dtype::float16>(
template void MoeFastHardamardWrapper<phi::dtype::float16, int8_t>(
const phi::dtype::float16 *x_data,
const int64_t *expert_idx_per_token,
const int64_t *recv_expert_count,
const phi::dtype::float16 *shift,
const phi::dtype::float16 *smooth,
const float* quant_scales,
@@ -851,6 +1027,8 @@ template void MoeFastHardamardWrapper<phi::dtype::float16, int8_t>(
const float quant_min_bound,
const int64_t token_num,
const int64_t dim,
const int num_max_tokens_per_expert,
bool used_in_ep_low_latency,
int8_t *out,
cudaStream_t &stream
);
@@ -858,6 +1036,7 @@ template void MoeFastHardamardWrapper<phi::dtype::float16, int8_t>(
template void MoeFastHardamardWrapper<phi::dtype::bfloat16, phi::dtype::bfloat16>(
const phi::dtype::bfloat16 *x_data,
const int64_t *expert_idx_per_token,
const int64_t *recv_expert_count,
const phi::dtype::bfloat16 *shift,
const phi::dtype::bfloat16 *smooth,
const float* quant_scales,
@@ -866,6 +1045,8 @@ template void MoeFastHardamardWrapper<phi::dtype::bfloat16, phi::dtype::bfloat16
const float quant_min_bound,
const int64_t token_num,
const int64_t dim,
const int num_max_tokens_per_expert,
bool used_in_ep_low_latency,
phi::dtype::bfloat16 *out,
cudaStream_t &stream
);
@@ -873,6 +1054,7 @@ template void MoeFastHardamardWrapper<phi::dtype::bfloat16, phi::dtype::bfloat16
template void MoeFastHardamardWrapper<phi::dtype::bfloat16, int8_t>(
const phi::dtype::bfloat16 *x_data,
const int64_t *expert_idx_per_token,
const int64_t *recv_expert_count,
const phi::dtype::bfloat16 *shift,
const phi::dtype::bfloat16 *smooth,
const float* quant_scales,
@@ -881,6 +1063,8 @@ template void MoeFastHardamardWrapper<phi::dtype::bfloat16, int8_t>(
const float quant_min_bound,
const int64_t token_num,
const int64_t dim,
const int num_max_tokens_per_expert,
bool used_in_ep_low_latency,
int8_t *out,
cudaStream_t &stream
);

View File

@@ -21,6 +21,7 @@
template <typename T, typename OutT>
void MoeFastHardamardWrapper(const T *x_data,
const int64_t *expert_idx_per_token,
const int64_t *recv_expert_count,
const T *shift,
const T *smooth,
const float* quant_scales,
@@ -29,5 +30,7 @@ void MoeFastHardamardWrapper(const T *x_data,
const float quant_min_bound,
const int64_t token_num,
const int64_t dim,
const int num_max_tokens_per_expert,
bool used_in_ep_low_latency,
OutT* out,
cudaStream_t &stream);

View File

@@ -240,6 +240,7 @@ void MoeFFNKernel(const paddle::Tensor& permute_input,
MoeFastHardamardWrapper<data_t, int8_t>(
act_out_tensor.data<data_t>(),
expert_idx_per_token ? expert_idx_per_token.get().data<int64_t>() : nullptr,
const_cast<int64_t*>(tokens_expert_prefix_sum.data<int64_t>()),
down_proj_shift, // down_proj_shift->data<T>(),
down_proj_smooth, // down_proj_smooth->data<T>(),
down_proj_in_scale ? const_cast<paddle::Tensor*>(down_proj_in_scale.get_ptr())->data<float>() : nullptr,
@@ -248,6 +249,8 @@ void MoeFFNKernel(const paddle::Tensor& permute_input,
-127.0,
expanded_active_expert_rows,
inter_size / 2,
num_max_tokens_per_expert,
used_in_ep_low_latency,
reinterpret_cast<int8_t *>(int8_act_out->ptr()),
stream
);

View File

@@ -49,12 +49,13 @@ void WeightOnlyMoeFFNKernel(const paddle::Tensor& permute_input,
typename WeightOnlyTraits::Arguments up_gate_proj_quant_args;
typename WeightOnlyTraits::Arguments down_proj_quant_args;
if constexpr (QuantMethod == cutlass::WintQuantMethod::kWeightOnlyInt2) {
up_gate_proj_quant_args.local_scale_ptr = up_gate_proj_local_scale->data<uint8_t>();
up_gate_proj_quant_args.code_scale_ptr = up_gate_proj_code_scale->data<float>();
up_gate_proj_quant_args.code_zp_ptr = up_gate_proj_code_zp->data<float>();
down_proj_quant_args.local_scale_ptr = down_proj_local_scale->data<uint8_t>();
down_proj_quant_args.code_scale_ptr = down_proj_code_scale->data<float>();
down_proj_quant_args.code_zp_ptr = down_proj_code_zp->data<float>();
up_gate_proj_quant_args.local_scale_ptr = const_cast<uint8_t*>(up_gate_proj_local_scale->data<uint8_t>());
up_gate_proj_quant_args.code_scale_ptr = const_cast<float*>(up_gate_proj_code_scale->data<float>());
up_gate_proj_quant_args.code_zp_ptr = const_cast<float*>(up_gate_proj_code_zp->data<float>());
down_proj_quant_args.local_scale_ptr = const_cast<uint8_t*>(down_proj_local_scale->data<uint8_t>());
down_proj_quant_args.code_scale_ptr = const_cast<float*>(down_proj_code_scale->data<float>());
down_proj_quant_args.code_zp_ptr = const_cast<float*>(down_proj_code_zp->data<float>());
}
auto moe_gemm_runner = MoeGemmRunner<NvType, WeightOnlyTraits>();

View File

@@ -180,7 +180,7 @@ void token_penalty_multi_scores_kernel(
int64_t token_num = shape[0];
int64_t length = shape[1];
int64_t length_id = pre_ids.shape()[1];
int64_t length_bad_words = bad_tokens.shape()[0];
int64_t length_bad_words = bad_tokens.shape()[1];
int64_t end_length = eos_token_id.shape()[0];

View File

@@ -30,30 +30,62 @@ __global__ void set_value_by_flags(bool *stop_flags,
const int *seq_lens,
const int bs,
const int end_length,
const int64_t *pre_ids,
const int pre_ids_len,
const int64_t *step_idx,
const int64_t *stop_seqs,
const int *stop_seqs_len,
const int stop_seqs_bs,
const int stop_seqs_max_len,
bool beam_search,
bool prefill_one_step_stop) {
int tid = threadIdx.x;
if (tid < bs) {
if (prefill_one_step_stop) {
stop_flags[tid] = true;
if (seq_lens[tid] == 0) {
topk_ids[tid] = -1;
}
next_tokens[tid] = topk_ids[tid];
} else {
if (stop_flags[tid]) {
if (seq_lens[tid] == 0) {
topk_ids[tid] = -1;
} else {
topk_ids[tid] = end_ids[0];
next_tokens[tid] = end_ids[0];
int bid = blockIdx.x;
if (tid >= stop_seqs_bs) return;
if (bid < bs) {
if(tid == 0){
if (prefill_one_step_stop) {
stop_flags[bid] = true;
if (seq_lens[bid] == 0) {
topk_ids[bid] = -1;
}
next_tokens[bid] = topk_ids[bid];
} else {
next_tokens[tid] = topk_ids[tid];
if (stop_flags[bid]) {
if (seq_lens[bid] == 0) {
topk_ids[bid] = -1;
} else {
topk_ids[bid] = end_ids[0];
next_tokens[bid] = end_ids[0];
}
} else {
next_tokens[bid] = topk_ids[bid];
}
}
if (!beam_search && is_in_end(topk_ids[bid], end_ids, end_length)) {
stop_flags[bid] = true;
}
}
if (!beam_search && is_in_end(topk_ids[tid], end_ids, end_length)) {
stop_flags[tid] = true;
// handle stop sequences
const int stop_seq_len = (stop_seqs_len + bid * stop_seqs_bs)[tid];
if (stop_seq_len <= 0) return;
const int64_t *stop_seq_now = stop_seqs + bid * stop_seqs_bs + tid * stop_seqs_max_len;
const int64_t *pre_ids_now = pre_ids + bid * pre_ids_len;
const int64_t step_idx_now = step_idx[bid];
bool is_end = true;
int count = 1;
for (int i = stop_seq_len - 1; i >= 0; --i) {
if ((step_idx_now - count) < 0 ||
pre_ids_now[step_idx_now - count++] != stop_seq_now[i]) {
is_end = false;
break;
}
}
if (is_end) {
next_tokens[bid] = end_ids[0];
stop_flags[bid] = true;
topk_ids[bid] = end_ids[0];
}
}
}
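The stop-sequence branch added above walks pre_ids backwards from step_idx and compares it against one stop sequence per thread. A plain C++ restatement of that check (the helper name and the assumption that pre_ids[step_idx - 1] is the most recently generated token are mine) looks like this:
// Hedged host-side restatement of the stop-sequence suffix match.
#include <cstdint>
#include <vector>
bool matches_stop_sequence(const std::vector<int64_t>& pre_ids,   // generated ids for one sequence
                           int64_t step_idx,                      // current generation step
                           const std::vector<int64_t>& stop_seq,  // one stop sequence (unpadded)
                           int stop_seq_len) {
    if (stop_seq_len <= 0) return false;
    int count = 1;
    for (int i = stop_seq_len - 1; i >= 0; --i) {
        const int64_t pos = step_idx - count;
        if (pos < 0 || pre_ids[pos] != stop_seq[i]) return false;
        ++count;
    }
    return true;  // the kernel then writes end_ids[0] and raises stop_flags
}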
@@ -63,6 +95,10 @@ void GetStopFlagsMulti(const paddle::Tensor &topk_ids,
const paddle::Tensor &seq_lens,
const paddle::Tensor &end_ids,
const paddle::Tensor &next_tokens,
const paddle::Tensor &pre_ids,
const paddle::Tensor &step_idx,
const paddle::Tensor &stop_seqs,
const paddle::Tensor &stop_seqs_len,
const bool beam_search) {
PD_CHECK(topk_ids.dtype() == paddle::DataType::INT64);
PD_CHECK(stop_flags.dtype() == paddle::DataType::BOOL);
@@ -83,8 +119,10 @@ void GetStopFlagsMulti(const paddle::Tensor &topk_ids,
std::vector<int64_t> shape = topk_ids.shape();
int64_t bs_now = shape[0];
int64_t end_length = end_ids.shape()[0];
int block_size = (bs_now + WARP_SIZE - 1) / WARP_SIZE * WARP_SIZE;
set_value_by_flags<<<1, block_size, 0, cu_stream>>>(
int stop_seqs_bs = stop_seqs.shape()[1];
int stop_seqs_max_len = stop_seqs.shape()[2];
int block_size = (stop_seqs_bs + WARP_SIZE - 1) / WARP_SIZE * WARP_SIZE;
set_value_by_flags<<<bs_now, block_size, 0, cu_stream>>>(
const_cast<bool *>(stop_flags.data<bool>()),
const_cast<int64_t *>(topk_ids.data<int64_t>()),
const_cast<int64_t *>(next_tokens.data<int64_t>()),
@@ -92,12 +130,19 @@ void GetStopFlagsMulti(const paddle::Tensor &topk_ids,
seq_lens.data<int>(),
bs_now,
end_length,
pre_ids.data<int64_t>(),
pre_ids.shape()[1],
step_idx.data<int64_t>(),
stop_seqs.data<int64_t>(),
stop_seqs_len.data<int>(),
stop_seqs_bs,
stop_seqs_max_len,
beam_search,
prefill_one_step_stop);
}
PD_BUILD_STATIC_OP(set_stop_value_multi_ends)
.Inputs({"topk_ids", "stop_flags", "seq_lens", "end_ids", "next_tokens"})
.Inputs({"topk_ids", "stop_flags", "seq_lens", "end_ids", "next_tokens", "pre_ids", "step_idx", "stop_seqs", "stop_seqs_len"})
.Attrs({"beam_search: bool"})
.Outputs({"topk_ids_out", "stop_flags_out", "next_tokens_out"})
.SetInplaceMap({{"topk_ids", "topk_ids_out"},

View File

@@ -1,133 +0,0 @@
// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <unistd.h>
#include "paddle/extension.h"
#include "helper.h"
#ifndef PD_BUILD_STATIC_OP
#define PD_BUILD_STATIC_OP(name) PD_BUILD_OP(static_op_##name)
#endif
__global__ void set_value_by_stop_seqs(bool *stop_flags,
int64_t *topk_ids,
const int64_t *pre_ids,
const int64_t *step_idx,
const int64_t *stop_seqs,
const int *stop_seqs_len,
const int *seq_lens,
const int64_t *end_ids,
const int bs,
const int stop_seqs_bs,
const int stop_seqs_max_len,
const int pre_ids_len) {
const int bid = blockIdx.x;
const int tid = threadIdx.x;
if (tid >= stop_seqs_bs) return;
const int stop_seq_len = stop_seqs_len[tid];
if (stop_seq_len <= 0) return;
const int64_t *stop_seq_now = stop_seqs + tid * stop_seqs_max_len;
const int64_t *pre_ids_now = pre_ids + bid * pre_ids_len;
const int64_t step_idx_now = step_idx[bid];
if (bid < bs) {
if (stop_flags[bid]) { // length limit exceeded: overwrite the current position with the end token
topk_ids[bid] = end_ids[0];
if (seq_lens[bid] == 0) { // already finished: set the current position to -1
topk_ids[bid] = -1;
}
return;
}
bool is_end = true;
int count = 1;
if (topk_ids[bid] == end_ids[0]) {
if (tid == 0) {
stop_flags[bid] = true;
}
return;
}
for (int i = stop_seq_len - 1; i >= 0; --i) {
if ((step_idx_now - count) < 0 ||
pre_ids_now[step_idx_now - count++] != stop_seq_now[i]) {
is_end = false;
break;
}
}
if (is_end) {
topk_ids[bid] = end_ids[0];
stop_flags[bid] = true;
}
}
}
void GetStopFlagsMultiSeqs(const paddle::Tensor &topk_ids,
const paddle::Tensor &pre_ids,
const paddle::Tensor &step_idx,
const paddle::Tensor &stop_flags,
const paddle::Tensor &seq_lens,
const paddle::Tensor &stop_seqs,
const paddle::Tensor &stop_seqs_len,
const paddle::Tensor &end_ids) {
PD_CHECK(topk_ids.dtype() == paddle::DataType::INT64);
PD_CHECK(stop_flags.dtype() == paddle::DataType::BOOL);
#ifdef PADDLE_WITH_CUSTOM_DEVICE
auto dev_ctx = static_cast<const phi::CustomContext*>(paddle::experimental::DeviceContextPool::Instance().Get(topk_ids.place()));
auto cu_stream = dev_ctx->stream();
#else
auto cu_stream = topk_ids.stream();
#endif
std::vector<int64_t> shape = topk_ids.shape();
std::vector<int64_t> stop_seqs_shape = stop_seqs.shape();
int bs_now = shape[0];
int stop_seqs_bs = stop_seqs_shape[0];
int stop_seqs_max_len = stop_seqs_shape[1];
int pre_ids_len = pre_ids.shape()[1];
int block_size = (stop_seqs_bs + WARP_SIZE - 1) / WARP_SIZE * WARP_SIZE;
set_value_by_stop_seqs<<<bs_now, block_size, 0, cu_stream>>>(
const_cast<bool *>(stop_flags.data<bool>()),
const_cast<int64_t *>(topk_ids.data<int64_t>()),
pre_ids.data<int64_t>(),
step_idx.data<int64_t>(),
stop_seqs.data<int64_t>(),
stop_seqs_len.data<int>(),
seq_lens.data<int>(),
end_ids.data<int64_t>(),
bs_now,
stop_seqs_bs,
stop_seqs_max_len,
pre_ids_len);
}
PD_BUILD_STATIC_OP(set_stop_value_multi_seqs)
.Inputs({"topk_ids",
"pre_ids",
"step_idx",
"stop_flags",
"seq_lens",
"stop_seqs",
"stop_seqs_len",
"end_ids"})
.Outputs({"topk_ids_out", "stop_flags_out"})
.SetInplaceMap({{"topk_ids", "topk_ids_out"},
{"stop_flags", "stop_flags_out"}})
.SetKernelFn(PD_KERNEL(GetStopFlagsMultiSeqs));

View File

@@ -171,7 +171,7 @@ void token_penalty_multi_scores_kernel(const paddle::Tensor &pre_ids,
int64_t vocab_size = shape[1];
int64_t max_dec_len = pre_ids.shape()[1];
int64_t bad_words_len = bad_tokens.shape()[0];
int64_t bad_words_len = bad_tokens.shape()[1];
int64_t eos_len = eos_token_id.shape()[0];
int64_t max_model_len = prompt_ids.shape()[1];

View File

@@ -256,11 +256,11 @@ elif paddle.is_compiled_with_cuda():
"gpu_ops/gather_idx.cu",
"gpu_ops/get_output_ep.cc",
"gpu_ops/get_mm_split_fuse.cc",
"gpu_ops/get_img_boundaries.cc",
"gpu_ops/token_penalty_multi_scores.cu",
"gpu_ops/token_penalty_only_once.cu",
"gpu_ops/stop_generation.cu",
"gpu_ops/stop_generation_multi_ends.cu",
"gpu_ops/stop_generation_multi_stop_seqs.cu",
"gpu_ops/set_flags.cu",
"gpu_ops/update_inputs_v1.cu",
"gpu_ops/recover_decode_task.cu",
@@ -529,7 +529,6 @@ elif paddle.is_compiled_with_custom_device("iluvatar_gpu"):
sources=[
"gpu_ops/get_padding_offset.cu",
"gpu_ops/set_value_by_flags.cu",
"gpu_ops/stop_generation_multi_stop_seqs.cu",
"gpu_ops/rebuild_padding.cu",
"gpu_ops/update_inputs.cu",
"gpu_ops/stop_generation_multi_ends.cu",

View File

@@ -0,0 +1,68 @@
// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <paddle/phi/backends/xpu/xpu_context.h>
#include "paddle/extension.h"
#include "paddle/phi/core/enforce.h"
#include "xpu/plugin.h"
void RecoverDecodeTask(const paddle::Tensor &stop_flags,
const paddle::Tensor &seq_lens_this_time,
const paddle::Tensor &seq_lens_encoder,
const paddle::Tensor &seq_lens_decoder,
const paddle::Tensor &step_seq_lens_decoder,
const paddle::Tensor &block_tables,
const paddle::Tensor &is_block_step,
const int block_size) {
phi::XPUPlace place(phi::backends::xpu::GetXPUCurrentDeviceId());
auto dev_ctx =
paddle::experimental::DeviceContextPool::Instance().Get(place);
auto xpu_ctx = static_cast<const phi::XPUContext *>(dev_ctx);
const int bsz = seq_lens_this_time.shape()[0];
const int block_num_per_seq = block_tables.shape()[1];
int r = baidu::xpu::api::plugin::recover_decode_task(
xpu_ctx->x_context(),
const_cast<bool *>(stop_flags.data<bool>()),
const_cast<int *>(seq_lens_this_time.data<int>()),
const_cast<int *>(seq_lens_encoder.data<int>()),
const_cast<int *>(seq_lens_decoder.data<int>()),
const_cast<int *>(step_seq_lens_decoder.data<int>()),
const_cast<int *>(block_tables.data<int>()),
const_cast<bool *>(is_block_step.data<bool>()),
bsz,
block_num_per_seq,
block_size);
PD_CHECK(r == 0, "baidu::xpu::api::plugin::recover_decode_task failed.");
}
PD_BUILD_OP(recover_decode_task)
.Inputs({"stop_flags",
"seq_lens_this_time",
"seq_lens_encoder",
"seq_lens_decoder",
"step_seq_lens_decoder",
"block_tables",
"is_block_step"})
.Attrs({"block_size: int"})
.Outputs({"seq_lens_this_time_out",
"seq_lens_encoder_out",
"seq_lens_decoder_out",
"stop_flags_out",
"is_block_step_out"})
.SetInplaceMap({{"seq_lens_this_time", "seq_lens_this_time_out"},
{"seq_lens_encoder", "seq_lens_encoder_out"},
{"seq_lens_decoder", "seq_lens_decoder_out"},
{"stop_flags", "stop_flags_out"},
{"is_block_step", "is_block_step_out"}})
.SetKernelFn(PD_KERNEL(RecoverDecodeTask));

View File

@@ -0,0 +1,105 @@
// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <paddle/phi/backends/xpu/xpu_context.h>
#include "paddle/extension.h"
#include "paddle/phi/core/enforce.h"
#include "xpu/plugin.h"
void UpdateInputesV1(const paddle::Tensor &stop_flags,
const paddle::Tensor &not_need_stop, // only on cpu
const paddle::Tensor &seq_lens_this_time,
const paddle::Tensor &seq_lens_encoder,
const paddle::Tensor &seq_lens_decoder,
const paddle::Tensor &step_seq_lens_decoder,
const paddle::Tensor &prompt_lens,
const paddle::Tensor &topk_ids,
const paddle::Tensor &input_ids,
const paddle::Tensor &block_tables,
const paddle::Tensor &stop_nums,
const paddle::Tensor &next_tokens,
const paddle::Tensor &is_block_step,
const int block_size) {
phi::XPUPlace place(phi::backends::xpu::GetXPUCurrentDeviceId());
auto dev_ctx =
paddle::experimental::DeviceContextPool::Instance().Get(place);
auto xpu_ctx = static_cast<const phi::XPUContext *>(dev_ctx);
const int max_bsz = stop_flags.shape()[0];
const int now_bsz = seq_lens_this_time.shape()[0];
// std::cout << "now_bsz: " << now_bsz << std::endl;
const int input_ids_stride = input_ids.shape()[1];
const int block_num_per_seq = block_tables.shape()[1];
auto not_need_stop_gpu = not_need_stop.copy_to(stop_flags.place(), false);
int r = baidu::xpu::api::plugin::update_inputs_v1(
xpu_ctx->x_context(),
const_cast<bool *>(not_need_stop_gpu.data<bool>()),
const_cast<int *>(seq_lens_this_time.data<int>()),
const_cast<int *>(seq_lens_encoder.data<int>()),
const_cast<int *>(seq_lens_decoder.data<int>()),
const_cast<int *>(step_seq_lens_decoder.data<int>()),
const_cast<int64_t *>(prompt_lens.data<int64_t>()),
const_cast<int64_t *>(topk_ids.data<int64_t>()),
const_cast<int64_t *>(input_ids.data<int64_t>()),
const_cast<int *>(block_tables.data<int>()),
stop_nums.data<int64_t>(),
const_cast<bool *>(stop_flags.data<bool>()),
const_cast<bool *>(is_block_step.data<bool>()),
next_tokens.data<int64_t>(),
now_bsz,
max_bsz,
input_ids_stride,
block_num_per_seq,
block_size);
PD_CHECK(r == 0, "baidu::xpu::api::plugin::update_inputs_kernel_v1 failed.");
auto not_need_stop_cpu =
not_need_stop_gpu.copy_to(not_need_stop.place(), false);
bool *not_need_stop_data = const_cast<bool *>(not_need_stop.data<bool>());
not_need_stop_data[0] = not_need_stop_cpu.data<bool>()[0];
}
PD_BUILD_OP(update_inputs_v1)
.Inputs({"stop_flags",
"not_need_stop",
"seq_lens_this_time",
"seq_lens_encoder",
"seq_lens_decoder",
"step_seq_lens_decoder",
"prompt_lens",
"topk_ids",
"input_ids",
"block_tables",
"stop_nums",
"next_tokens",
"is_block_step"})
.Attrs({"block_size: int"})
.Outputs({"not_need_stop_out",
"seq_lens_this_time_out",
"seq_lens_encoder_out",
"seq_lens_decoder_out",
"step_seq_lens_decoder_out",
"topk_ids_out",
"input_ids_out",
"stop_flags_out",
"is_block_step_out"})
.SetInplaceMap({{"not_need_stop", "not_need_stop_out"},
{"seq_lens_this_time", "seq_lens_this_time_out"},
{"seq_lens_encoder", "seq_lens_encoder_out"},
{"seq_lens_decoder", "seq_lens_decoder_out"},
{"topk_ids", "topk_ids_out"},
{"input_ids", "input_ids_out"},
{"stop_flags", "stop_flags_out"},
{"step_seq_lens_decoder", "step_seq_lens_decoder_out"},
{"is_block_step", "is_block_step_out"}})
.SetKernelFn(PD_KERNEL(UpdateInputesV1));

View File

@@ -86,6 +86,39 @@ recover_block(Context *ctx,
const int block_num_per_seq, const int length,
const int pre_id_length);
DLL_EXPORT int
recover_decode_task(Context *ctx, bool *stop_flags,
int *seq_lens_this_time,
int *seq_lens_encoder,
int *seq_lens_decoder,
int *step_seq_lens_decoder,
int *block_tables,
bool *is_block_step,
const int bsz,
const int block_num_per_seq,
const int block_size);
DLL_EXPORT int
update_inputs_v1(Context *ctx, bool *not_need_stop,
int *seq_lens_this_time,
int *seq_lens_encoder,
int *seq_lens_decoder,
int *step_seq_lens_decoder,
int64_t *prompt_lens,
int64_t *topk_ids,
int64_t *input_ids,
int *block_tables,
const int64_t *stop_nums,
bool *stop_flags,
bool *is_block_step,
const int64_t *next_tokens,
const int bsz,
const int max_bsz,
const int input_ids_stride,
const int block_num_per_seq,
const int block_size);
template <typename TX, typename TY>
DLL_EXPORT int
eb_adjust_batch(Context *ctx, const TX *x, TY *y,

View File

@@ -0,0 +1,41 @@
#include "xpu/kernel/cluster.h"
#include "xpu/kernel/cluster_partition.h"
#include "xpu/kernel/cluster_primitive.h"
namespace xpu3 {
namespace plugin {
__global__ void recover_decode_task(bool *stop_flags,
int *seq_lens_this_time,
int *seq_lens_encoder,
int *seq_lens_decoder,
int *step_seq_lens_decoder,
int *block_tables,
bool *is_block_step,
const int bsz,
const int block_num_per_seq,
const int block_size) {
int cid = core_id();
int ncores = core_num();
int clusterid = cluster_id();
int nclusters = cluster_num();
int thread_idx = clusterid * ncores + cid;
int nthreads = nclusters * ncores;
// if (clusterid != 0) return;
for (; thread_idx < bsz; thread_idx += nthreads) {
if (is_block_step[thread_idx]) {
// int *block_table_now = block_tables + thread_idx * block_num_per_seq;
if (block_tables[thread_idx * block_num_per_seq + step_seq_lens_decoder[thread_idx] / block_size] != -1) {
// can be recovered for decoding
is_block_step[thread_idx] = false;
seq_lens_this_time[thread_idx] = 1;
stop_flags[thread_idx] = false;
seq_lens_encoder[thread_idx] = 0;
seq_lens_decoder[thread_idx] = step_seq_lens_decoder[thread_idx];
}
}
}
}
} // namespace plugin
} // namespace xpu3

View File

@@ -0,0 +1,131 @@
#include "xpu/kernel/cluster.h"
#include "xpu/kernel/cluster_partition.h"
#include "xpu/kernel/cluster_primitive.h"
// #include <stdio.h>
// using namespace std;
#include "xpu/kernel/xtdk_io.h"
#include "xpu/kernel/xtdk.h"
namespace xpu3 {
namespace plugin {
__global__ void update_inputs_v1(bool *not_need_stop,
int *seq_lens_this_time,
int *seq_lens_encoder,
int *seq_lens_decoder,
int *step_seq_lens_decoder,
int64_t *prompt_lens,
int64_t *topk_ids,
int64_t *input_ids,
int *block_tables,
const int64_t *stop_nums,
bool *stop_flags,
bool *is_block_step,
const int64_t *next_tokens,
const int bsz,
const int max_bsz,
const int input_ids_stride,
const int block_num_per_seq,
const int block_size) {
// std::cout << "seq_lens_this_time " << seq_lens_this_time[0] << std::endl;
int cid = core_id();
int ncores = core_num();
int clusterid = cluster_id();
int nclusters = cluster_num();
int thread_idx = clusterid * ncores + cid;
if (clusterid != 0) return;
const int max_bs = 1024;
__shared__ bool stop_flags_sm[max_bs];
__shared__ int stop_flags_int_sm[max_bs];
if(cid == 0){
GM2SM(stop_flags, stop_flags_sm, sizeof(bool) * bsz);
}
sync_all();
for (int i = cid; i < max_bsz; i += ncores) {
if(i < bsz){
stop_flags_sm[i] = stop_flags[i];
stop_flags_int_sm[i] = static_cast<int64_t>(stop_flags_sm[i]);
}else{
stop_flags_sm[i] = true;
stop_flags_int_sm[i] = 1;
}
if(i<bsz){
int seq_len_this_time_update = 0;
int seq_len_decoder_update = 0;
int seq_lens_encoder_update = 0;
if(stop_flags_sm[i]){
LM2GM(&seq_len_this_time_update, seq_lens_this_time + i, sizeof(int));
LM2GM(&seq_len_decoder_update, seq_lens_decoder + i, sizeof(int));
LM2GM(&seq_lens_encoder_update, seq_lens_encoder + i, sizeof(int));
}else{
GM2LM(seq_lens_this_time+i, &seq_len_this_time_update, sizeof(int));
GM2LM(seq_lens_decoder+i, &seq_len_decoder_update, sizeof(int));
GM2LM(seq_lens_encoder+i, &seq_lens_encoder_update, sizeof(int));
int sum_of_seq_lens_this_time_and_seq_lens_decoder = seq_len_this_time_update + seq_len_decoder_update;
int64_t prompt_lens_update = 0;  // int64_t: the GM2LM below copies sizeof(int64_t) bytes
GM2LM(prompt_lens+i, &prompt_lens_update, sizeof(int64_t));
// decoding
if(sum_of_seq_lens_this_time_and_seq_lens_decoder >= prompt_lens_update){
seq_len_decoder_update = seq_len_this_time_update + seq_len_decoder_update;
LM2GM(&seq_len_decoder_update, seq_lens_decoder+i, sizeof(int));
seq_len_this_time_update = 1;
LM2GM(&seq_len_this_time_update, seq_lens_this_time + i, sizeof(int));
seq_lens_encoder_update = 0;
LM2GM(&seq_lens_encoder_update, seq_lens_encoder + i, sizeof(int));
int64_t input_ids_update;
GM2LM(next_tokens + i, &input_ids_update, sizeof(int64_t));
LM2GM(&input_ids_update, input_ids + i * input_ids_stride, sizeof(int64_t));
// check whether no free block is available at the new decode position
if(seq_len_this_time_update != 0 && block_tables[i * block_num_per_seq + seq_len_decoder_update/block_size] == -1){
is_block_step[i] = true;
seq_len_this_time_update = 0;
LM2GM(&seq_len_this_time_update, seq_lens_this_time + i, sizeof(int));
stop_flags_sm[i] = true;
SM2GM(stop_flags_sm+i, stop_flags+i, sizeof(bool));
LM2GM(&seq_len_decoder_update, step_seq_lens_decoder+i, sizeof(int));
seq_len_decoder_update = 0;
LM2GM(&seq_len_decoder_update, seq_lens_decoder + i, sizeof(int));
stop_flags_int_sm[i] = 1;
}
}else{
stop_flags_sm[i] = true;
SM2GM(stop_flags_sm+i, stop_flags+i, sizeof(bool));
seq_len_this_time_update = 0;
LM2GM(&seq_len_this_time_update, seq_lens_this_time + i, sizeof(int));
seq_len_decoder_update = 0;
seq_lens_encoder_update = 0;
LM2GM(&seq_len_decoder_update, seq_lens_decoder + i, sizeof(int));
LM2GM(&seq_lens_encoder_update, seq_lens_encoder + i, sizeof(int));
int64_t topk_ids_update = -1;
LM2GM(&topk_ids_update, topk_ids + i, sizeof(int64_t));
stop_flags_int_sm[i] = 1;
}
}
}
}
sync_all();
sync_cluster();
int stop_sum = 0;
if (cid == 0) {
for (int i = 0; i < max_bsz; i++) {
stop_sum += stop_flags_int_sm[i];
}
// printf("stop_sum : %d\n", stop_sum);
int64_t stop_num;
GM2LM(stop_nums, &stop_num, sizeof(int64_t));
bool not_need_stop_update = stop_sum < static_cast<int>(stop_num);
mfence_lm();
LM2GM(&not_need_stop_update, not_need_stop, sizeof(bool));
}
}
} // namespace plugin
} // namespace xpu3


@@ -0,0 +1,107 @@
// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "xpu/plugin.h"
#include "xpu/refactor/impl_public/wrapper_check.h"
#include <algorithm>
#include <numeric>
namespace xpu3 {
namespace plugin {
__attribute__((global)) void
recover_decode_task(bool *stop_flags,
int *seq_lens_this_time,
int *seq_lens_encoder,
int *seq_lens_decoder,
int *step_seq_lens_decoder,
int *block_tables,
bool *is_block_step,
const int bsz,
const int block_num_per_seq,
const int block_size);
} // namespace plugin
} // namespace xpu3
namespace baidu {
namespace xpu {
namespace api {
namespace plugin {
static int xpu3_wrapper(Context *ctx, bool *stop_flags,
int *seq_lens_this_time,
int *seq_lens_encoder,
int *seq_lens_decoder,
int *step_seq_lens_decoder,
int *block_tables,
bool *is_block_step,
const int bsz,
const int block_num_per_seq,
const int block_size) {
using XPU_INT64 = typename XPUIndexType<int64_t>::type;
auto recover_decode_task = xpu3::plugin::recover_decode_task;
recover_decode_task<<<ctx->ncluster(), 64, ctx->xpu_stream>>>(
stop_flags,
seq_lens_this_time,
seq_lens_encoder,
seq_lens_decoder,
step_seq_lens_decoder,
block_tables,
is_block_step,
bsz,
block_num_per_seq,
block_size);
return api::SUCCESS;
}
int recover_decode_task(Context *ctx, bool *stop_flags,
int *seq_lens_this_time,
int *seq_lens_encoder,
int *seq_lens_decoder,
int *step_seq_lens_decoder,
int *block_tables,
bool *is_block_step,
const int bsz,
const int block_num_per_seq,
const int block_size) {
WRAPPER_CHECK_CTX(ctx);
WRAPPER_DUMP_FUNCTION_T1(ctx, "recover_decode_task", int);
WRAPPER_DUMP_PARAM5(ctx, stop_flags, seq_lens_this_time,
seq_lens_encoder, seq_lens_decoder, step_seq_lens_decoder);
WRAPPER_DUMP_PARAM2(ctx, block_tables, is_block_step);
WRAPPER_DUMP_PARAM3(ctx, bsz, block_num_per_seq, block_size);
WRAPPER_DUMP(ctx);
if (ctx->dev().type() == api::kCPU) {
assert(false);
}
if (ctx->dev().type() == api::kXPU2 || ctx->dev().type() == api::kXPU3) {
return xpu3_wrapper(ctx, stop_flags,
seq_lens_this_time,
seq_lens_encoder,
seq_lens_decoder,
step_seq_lens_decoder,
block_tables,
is_block_step,
bsz,
block_num_per_seq,
block_size);
}
WRAPPER_UNIMPLEMENTED(ctx);
}
} // namespace plugin
} // namespace api
} // namespace xpu
} // namespace baidu


@@ -0,0 +1,149 @@
// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "xpu/plugin.h"
#include "xpu/refactor/impl_public/wrapper_check.h"
#include <algorithm>
#include <numeric>
namespace xpu3 {
namespace plugin {
__attribute__((global)) void
update_inputs_v1(bool *not_need_stop,
int *seq_lens_this_time,
int *seq_lens_encoder,
int *seq_lens_decoder,
int *step_seq_lens_decoder,
int64_t *prompt_lens,
int64_t *topk_ids,
int64_t *input_ids,
int *block_tables,
const int64_t *stop_nums,
bool *stop_flags,
bool *is_block_step,
const int64_t *next_tokens,
const int bsz,
const int max_bsz,
const int input_ids_stride,
const int block_num_per_seq,
const int block_size);
} // namespace plugin
} // namespace xpu3
namespace baidu {
namespace xpu {
namespace api {
namespace plugin {
static int xpu3_wrapper(Context *ctx, bool *not_need_stop,
int *seq_lens_this_time,
int *seq_lens_encoder,
int *seq_lens_decoder,
int *step_seq_lens_decoder,
int64_t *prompt_lens,
int64_t *topk_ids,
int64_t *input_ids,
int *block_tables,
const int64_t *stop_nums,
bool *stop_flags,
bool *is_block_step,
const int64_t *next_tokens,
const int bsz,
const int max_bsz,
const int input_ids_stride,
const int block_num_per_seq,
const int block_size) {
using XPU_INT64 = typename XPUIndexType<int64_t>::type;
auto update_inputs_v1 = xpu3::plugin::update_inputs_v1;
// a reduce is performed inside the kernel, so only one cluster can be used
update_inputs_v1<<<1, 64, ctx->xpu_stream>>>(
not_need_stop,
seq_lens_this_time,
seq_lens_encoder,
seq_lens_decoder,
step_seq_lens_decoder,
reinterpret_cast<XPU_INT64 *>(prompt_lens),
reinterpret_cast<XPU_INT64 *>(topk_ids),
reinterpret_cast<XPU_INT64 *>(input_ids),
block_tables,
reinterpret_cast<const XPU_INT64 *>(stop_nums),
stop_flags,
is_block_step,
reinterpret_cast<const XPU_INT64 *>(next_tokens),
bsz,
max_bsz,
input_ids_stride,
block_num_per_seq,
block_size);
return api::SUCCESS;
}
int update_inputs_v1(Context *ctx, bool *not_need_stop,
int *seq_lens_this_time,
int *seq_lens_encoder,
int *seq_lens_decoder,
int *step_seq_lens_decoder,
int64_t *prompt_lens,
int64_t *topk_ids,
int64_t *input_ids,
int *block_tables,
const int64_t *stop_nums,
bool *stop_flags,
bool *is_block_step,
const int64_t *next_tokens,
const int bsz,
const int max_bsz,
const int input_ids_stride,
const int block_num_per_seq,
const int block_size) {
WRAPPER_CHECK_CTX(ctx);
WRAPPER_DUMP_FUNCTION_T1(ctx, "update_inputs_v1", int);
WRAPPER_DUMP_PARAM5(ctx, not_need_stop, seq_lens_this_time,
seq_lens_encoder, seq_lens_decoder, step_seq_lens_decoder);
WRAPPER_DUMP_PARAM5(ctx, prompt_lens, topk_ids, input_ids, block_tables, stop_nums);
WRAPPER_DUMP_PARAM3(ctx, stop_flags, is_block_step, next_tokens);
WRAPPER_DUMP_PARAM5(ctx, bsz, max_bsz, input_ids_stride, block_num_per_seq, block_size);
WRAPPER_DUMP(ctx);
if (ctx->dev().type() == api::kCPU) {
assert(false);
}
if (ctx->dev().type() == api::kXPU2 || ctx->dev().type() == api::kXPU3) {
return xpu3_wrapper(ctx, not_need_stop,
seq_lens_this_time,
seq_lens_encoder,
seq_lens_decoder,
step_seq_lens_decoder,
prompt_lens,
topk_ids,
input_ids,
block_tables,
stop_nums,
stop_flags,
is_block_step,
next_tokens,
bsz,
max_bsz,
input_ids_stride,
block_num_per_seq,
block_size);
}
WRAPPER_UNIMPLEMENTED(ctx);
}
} // namespace plugin
} // namespace api
} // namespace xpu
} // namespace baidu


@@ -144,6 +144,8 @@ def xpu_setup_ops():
"./ops/get_token_penalty_multi_scores.cc",
"./ops/get_padding_offset.cc",
"./ops/update_inputs.cc",
"./ops/recover_decode_task.cc",
"./ops/update_inputs_v1.cc",
"./ops/get_output.cc",
"./ops/step.cc",
"./ops/get_infer_param.cc",


@@ -1,10 +1,10 @@
FROM ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/fastdeploy-cuda-12.6:2.0.0
ARG PADDLE_VERSION=3.1.0
ARG FD_VERSION=2.0.0
ENV DEBIAN_FRONTEND=noninteractive
WORKDIR /workspace
RUN rm -rf /workspace/FastDeploy
COPY . /workspace/FastDeploy
RUN echo "ulimit -u unlimited" >> /root/.bashrc
RUN echo "ulimit -n 65536" >> /root/.bashrc
@@ -13,10 +13,10 @@ RUN echo "ulimit -n 65536" >> /root/.bashrc
RUN python -m pip uninstall paddlepaddle-gpu fastdeploy-gpu -y
# install paddlepaddle
RUN python -m pip install paddlepaddle-gpu==3.1.0 -i https://www.paddlepaddle.org.cn/packages/stable/cu126/
RUN python -m pip install --no-cache-dir paddlepaddle-gpu==${PADDLE_VERSION} -i https://www.paddlepaddle.org.cn/packages/stable/cu126/
# build and install FastDeploy
RUN cd FastDeploy && bash build.sh 1 python false [80,90] && python -m pip install --no-cache-dir dist/* && rm -rf /workspace/FastDeploy
RUN python -m pip install --no-cache-dir fastdeploy-gpu==${FD_VERSION} -i https://www.paddlepaddle.org.cn/packages/stable/fastdeploy-gpu-80_90/ --extra-index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
ENV http_proxy=""
ENV https_proxy=""


@@ -1,4 +1,6 @@
FROM ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/paddlenlp:llm-base-gcc12.3-xpu-xft20250402-v1.1
ARG PADDLE_VERSION=3.1.0
ARG FD_VERSION=2.0.0
WORKDIR /workspace
@@ -14,23 +16,16 @@ RUN apt-get update && apt-get install -y libibverbs-dev librdmacm-dev cmake pybi
# uninstall existing package
RUN python -m pip uninstall paddlepaddle-gpu paddlepaddle-xpu -y
# install paddlepaddle
RUN python -m pip install --no-cache-dir --progress-bar off paddlepaddle-xpu==3.1.0 -i https://www.paddlepaddle.org.cn/packages/stable/xpu-p800/
# install paddlepaddle-xpu
RUN python -m pip install --no-cache-dir --progress-bar off paddlepaddle-xpu==${PADDLE_VERSION} -i https://www.paddlepaddle.org.cn/packages/stable/xpu-p800/
COPY . /workspace/FastDeploy
RUN python -m pip install --no-cache-dir fastdeploy-xpu==${FD_VERSION} -i https://www.paddlepaddle.org.cn/packages/stable/fastdeploy-xpu-p800/ --extra-index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
# get xtdk and xvllm and xre
RUN mkdir -p /workspace/deps && cd /workspace/deps && \
wget https://klx-sdk-release-public.su.bcebos.com/xre/kl3-release/5.0.21.21/xre-Linux-x86_64-5.0.21.21.tar.gz && \
tar -zxf xre-Linux-x86_64-5.0.21.21.tar.gz && mv xre-Linux-x86_64-5.0.21.21 xre && \
cd /workspace/FastDeploy && bash custom_ops/xpu_ops/src/download_dependencies.sh stable
tar -zxf xre-Linux-x86_64-5.0.21.21.tar.gz && mv xre-Linux-x86_64-5.0.21.21 xre
ENV PATH=/workspace/deps/xre/bin:$PATH
ENV CLANG_PATH=/workspace/FastDeploy/custom_ops/xpu_ops/src/third_party/xtdk
ENV XVLLM_PATH=/workspace/FastDeploy/custom_ops/xpu_ops/src/third_party/xvllm
# build and install FastDeploy
RUN cd /workspace/FastDeploy && bash build.sh && python -m pip install --no-cache-dir dist/* && rm -rf /workspace/FastDeploy
ENV http_proxy=""
ENV https_proxy=""

docs/features/early_stop.md Normal file

@@ -0,0 +1,122 @@
# Early Stopping
Early stopping is used to terminate the model's token generation ahead of time. Specifically, different strategies are used to decide whether the token sequence generated so far meets the early-stopping criteria; if so, generation for that request stops early. FastDeploy currently supports the repetition strategy and the stop sequence strategy.
## 1. Repetition Strategy
* The repetition strategy determines whether to trigger the early stopping function by checking the number of times a high-probability token is generated.
* Specifically, if the probability of generating a token for a batch exceeds a user-set probability threshold for a specified number of consecutive times, token generation for that batch is terminated prematurely.
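The check itself is simple to state in code. The sketch below is illustrative only (the class and method names are placeholders, not FastDeploy internals) and uses the same `window_size` and `threshold` hyperparameters described later in this section:
```python
# Minimal sketch of the repetition strategy: stop a batch entry once the
# probability of its sampled token has exceeded `threshold` for
# `window_size` consecutive steps.
class RepetitionEarlyStopper:
    def __init__(self, window_size: int = 3000, threshold: float = 0.99):
        self.window_size = window_size
        self.threshold = threshold
        self.counters = {}  # batch index -> consecutive high-probability steps

    def should_stop(self, batch_idx: int, token_prob: float) -> bool:
        if token_prob > self.threshold:
            self.counters[batch_idx] = self.counters.get(batch_idx, 0) + 1
        else:
            self.counters[batch_idx] = 0  # streak broken, reset the counter
        return self.counters[batch_idx] >= self.window_size
```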
### Usage Instructions
When starting the service, add the early stopping function startup option.
* Online inference startup example:
* Using default hyperparameters: --enable-early-stop
```shell
python -m fastdeploy.entrypoints.openai.api_server \
--model baidu/ERNIE-4.5-0.3B-Paddle \
--port 8180 \
--metrics-port 8181 \
--engine-worker-queue-port 8182 \
--max-model-len 32768 \
--max-num-seqs 32 \
--enable-early-stop
```
* Using custom hyperparameters: --early-stop-config
```shell
python -m fastdeploy.entrypoints.openai.api_server \
--model baidu/ERNIE-4.5-0.3B-Paddle \
--port 8180 \
--metrics-port 8181 \
--engine-worker-queue-port 8182 \
--max-model-len 32768 \
--max-num-seqs 32 \
--early-stop-config '{"enable_early_stop":true, "window_size": 1000, "threshold": 0.9}'
```
* Offline inference example
* Using default hyperparameters: enable_early_stop
```python
from fastdeploy.engine.sampling_params import SamplingParams
from fastdeploy.entrypoints.llm import LLM
model_name_or_path = "baidu/ERNIE-4.5-0.3B-Paddle"
sampling_params = SamplingParams(temperature=0.1, max_tokens=30)
llm = LLM(model=model_name_or_path, tensor_parallel_size=1, enable_early_stop=True)
output = llm.generate(prompts="who are you?", use_tqdm=True, sampling_params=sampling_params)
print(output)
```
* Using custom hyperparameters: early_stop_config
```python
from fastdeploy.engine.sampling_params import SamplingParams
from fastdeploy.entrypoints.llm import LLM
model_name_or_path = "baidu/ERNIE-4.5-0.3B-Paddle"
early_stop_config = {"enable_early_stop":True, "window_size":1000, "threshold":0.9}
sampling_params = SamplingParams(temperature=0.1, max_tokens=30)
llm = LLM(model=model_name_or_path, tensor_parallel_size=1, early_stop_config=early_stop_config)
output = llm.generate(prompts="who are you?", use_tqdm=True, sampling_params=sampling_params)
print(output)
```
### Parameter Description
* `enable_early_stop`: (bool) Whether to enable the early stopping. Default False.
* `strategy`: (str) The strategy used by the early stopping. Currently, only the repetition strategy is supported. Default "repetition".
* `window_size`: (int) The upper limit of the number of consecutive high-probability tokens in the repetition strategy. If the number exceeds this limit, the early stopping will be triggered. Default 3000.
* `threshold`: (float) The high-probability threshold in the repetition strategy. Default 0.99.
## 2. Stop Sequence
* The Stop Sequence strategy determines whether to trigger early stopping by checking whether the generated token sequence contains a user-specified stop sequence.
* Specifically, if the token sequence generated by a batch contains a user-specified stop sequence, token generation for that batch is terminated prematurely.
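As an illustration only (not FastDeploy's implementation, which may match on token IDs rather than raw strings), the check reduces to a substring test over the decoded output of each batch entry:
```python
# Minimal sketch of the stop-sequence check: generation for a batch entry is
# terminated as soon as its decoded output contains any user-provided stop string.
from typing import List, Union

def hits_stop_sequence(generated_text: str, stop: Union[str, List[str]]) -> bool:
    stop_seqs = [stop] if isinstance(stop, str) else stop
    return any(s in generated_text for s in stop_seqs)

# With the example request below, generation would stop once "出去走走" appears in the output.
print(hits_stop_sequence("今天天气真好，那就出去走走吧", ["明天", "出去走走"]))  # True
```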
### Usage Instructions
Before starting the service, set the following environment variables
```
FD_STOP_SEQS_MAX_LEN (Maximum length of stop sequences, default is 8)
FD_MAX_STOP_SEQS_NUM (Maximum number of stop sequences, default is 5)
```
Send a request with the `stop` parameter, which can be a `str` or a `List[str]`:
* online serving, set `stop` parameter in request
```
# create a chat request with "stop" parameter
import openai
ip = "0.0.0.0"
service_http_port = "8233"
client = openai.Client(base_url=f"http://{ip}:{service_http_port}/v1", api_key="EMPTY_API_KEY")
response = client.chat.completions.create(
model="default",
messages=[
{"role": "user", "content": '今天天气真好'},
],
temperature=1.0,
top_p=0,
stream=False,
stop=["明天", "出去走走"]
)
```
* offline LLM, set `stop_seqs` parameter in `SamplingParams`
```
from fastdeploy.engine.sampling_params import SamplingParams
from fastdeploy.entrypoints.llm import LLM
model_name_or_path = "ERNIE-4.5-21B-A3B-Paddle"
sampling_params = SamplingParams(temperature=1, top_p=0, stop=["出去走走"])
llm = LLM(model=model_name_or_path, tensor_parallel_size=1)
output = llm.chat(messages=[{"role": "user", "content": "今天天气真好"}], use_tqdm=True, sampling_params=sampling_params)
print(output)
```


@@ -64,6 +64,7 @@ python -m fastdeploy.entrypoints.openai.api_server \
--port 8801 \
--metrics-port 8802 \
--engine-worker-queue-port 8803 \
--model baidu/ERNIE-4.5-0.3B-Paddle \
--scheduler-name global \
--scheduler-ttl 900 \
--scheduler-host "127.0.0.1" \
@@ -71,7 +72,7 @@ python -m fastdeploy.entrypoints.openai.api_server \
--scheduler-db 0 \
--scheduler-password "" \
--scheduler-topic "default" \
--scheduler-min-load_score 3 \
--scheduler-min-load-score 3 \
--scheduler-load-shards-num 1
```


@@ -8,14 +8,14 @@ Reasoning models return an additional `reasoning_content` field in their output,
| baidu/ERNIE-4.5-VL-424B-A47B-Paddle | ernie-45-vl | ✓ |
| baidu/ERNIE-4.5-VL-28B-A3B-Paddle | ernie-45-vl | ✓ |
The reasoning model requires a specified parser to extract reasoning content. The reasoning mode can be disabled by setting the `enable_thinking=False` parameter.
The reasoning model requires a specified parser to extract reasoning content. The reasoning mode can be disabled by setting the `"enable_thinking": false` parameter.
Interfaces that support toggling the reasoning mode:
1. `/v1/chat/completions` requests in OpenAI services.
2. `/v1/chat/completions` requests in the OpenAI Python client.
3. `llm.chat` requests in Offline interfaces.
For reasoning models, the length of the reasoning content can be controlled via `reasoning_max_tokens`. Add `metadata={"reasoning_max_tokens": 1024}` to the request.
For reasoning models, the length of the reasoning content can be controlled via `reasoning_max_tokens`. Add `"reasoning_max_tokens": 1024` to the request.
### Quick Start
When launching the model service, specify the parser name using the `--reasoning-parser` argument.
@@ -43,7 +43,8 @@ curl -X POST "http://0.0.0.0:8192/v1/chat/completions" \
{"type": "text", "text": "Which era does the cultural relic in the picture belong to"}
]}
],
"metadata": {"enable_thinking": true}
"chat_template_kwargs":{"enable_thinking": true},
"reasoning_max_tokens": 1024
}'
```
@@ -68,7 +69,10 @@ chat_response = client.chat.completions.create(
],
model="vl",
stream=True,
metadata={"enable_thinking": True}
extra_body={
"chat_template_kwargs":{"enable_thinking": True},
"reasoning_max_tokens": 1024
}
)
for chunk in chat_response:
if chunk.choices[0].delta is not None:

docs/features/sampling.md Normal file

@@ -0,0 +1,225 @@
# Sampling Strategies
Sampling strategies are used to determine how to select the next token from the output probability distribution of a model. FastDeploy currently supports multiple sampling strategies including Top-p, Top-k_Top-p, and Min-p Sampling.
1. Top-p Sampling
* Top-p sampling truncates the probability cumulative distribution, considering only the most likely token set that reaches a specified threshold p.
* It dynamically selects the number of tokens considered, ensuring diversity in the results while avoiding unlikely tokens.
2. Top-k_Top-p Sampling
* Initially performs top-k sampling, then normalizes within the top-k results, and finally performs top-p sampling.
* By limiting the initial selection range (top-k) and then accumulating probabilities within it (top-p), it improves the quality and coherence of the generated text.
3. Min-p Sampling
* Min-p sampling calculates `pivot=max_prob * min_p`, then retains only tokens with probabilities greater than the `pivot` (setting others to zero) for subsequent sampling.
* It filters out tokens with relatively low probabilities, sampling only from high-probability tokens to improve generation quality.
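To make the three strategies concrete, the following sketch (illustrative only, not FastDeploy's kernel code) applies min-p filtering, then top-k, then top-p truncation to a softmax-normalized probability vector before sampling:
```python
import numpy as np

def sample(probs: np.ndarray, top_k: int = 0, top_p: float = 1.0, min_p: float = 0.0) -> int:
    p = probs.astype(np.float64).copy()
    if min_p > 0.0:
        p[p < p.max() * min_p] = 0.0           # min-p: drop tokens below pivot = max_prob * min_p
    if top_k > 0:
        kth = np.sort(p)[-top_k]
        p[p < kth] = 0.0                       # top-k: keep only the k most likely tokens
    if top_p < 1.0:
        order = np.argsort(-p)
        cdf = np.cumsum(p[order]) / p.sum()
        cutoff = np.searchsorted(cdf, top_p) + 1  # smallest prefix whose cumulative prob reaches top_p
        p[order[cutoff:]] = 0.0
    p /= p.sum()                               # renormalize over the surviving tokens and sample
    return int(np.random.choice(len(p), p=p))
```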
## Usage Instructions
During deployment, you can choose the sampling algorithm by setting the environment variable `FD_SAMPLING_CLASS`. Available values are `base`, `base_non_truncated`, `air`, or `rejection`.
**Algorithms Supporting Only Top-p Sampling**
* `base` (default): Directly normalizes using the `top_p` value, favoring tokens with greater probabilities.
* `base_non_truncated`: Strictly follows the Top-p sampling logic, first selecting the smallest set that reaches the cumulative probability of `top_p`, then normalizing these selected elements.
* `air`: This algorithm is inspired by [TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM) and supports Top-p sampling.
**Algorithms Supporting Top-p and Top-k_Top-p Sampling**
* `rejection`: This algorithm is inspired by [flashinfer](https://github.com/flashinfer-ai/flashinfer) and allows flexible settings for `top_k` and `top_p` parameters for Top-p or Top-k_Top-p sampling.
## Configuration Method
### Top-p Sampling
1. During deployment, set the environment variable to select the sampling algorithm, default is base:
```bash
export FD_SAMPLING_CLASS=rejection # base, base_non_truncated, or air
```
2. When sending a request, specify the following parameters:
* Example request with curl:
```bash
curl -X POST "http://0.0.0.0:9222/v1/chat/completions" \
-H "Content-Type: application/json" \
-d '{
"messages": [
{"role": "user", "content": "How old are you"}
],
"top_p": 0.8
}'
```
* Example request with Python:
```python
import openai
host = "0.0.0.0"
port = "8170"
client = openai.Client(base_url=f"http://{host}:{port}/v1", api_key="null")
response = client.chat.completions.create(
model="null",
messages=[
{"role": "system", "content": "I'm a helpful AI assistant."},
],
stream=True,
top_p=0.8
)
for chunk in response:
if chunk.choices[0].delta:
print(chunk.choices[0].delta.content, end='')
print('\n')
```
### Top-k_Top-p Sampling
1. During deployment, set the environment variable to select the rejection sampling algorithm:
```bash
export FD_SAMPLING_CLASS=rejection
```
2. When sending a request, specify the following parameters:
* Example request with curl:
```bash
curl -X POST "http://0.0.0.0:9222/v1/chat/completions" \
-H "Content-Type: application/json" \
-d '{
"messages": [
{"role": "user", "content": "How old are you"}
],
"top_p": 0.8,
"top_k": 20
}'
```
* Example request with Python:
```python
import openai
host = "0.0.0.0"
port = "8170"
client = openai.Client(base_url=f"http://{host}:{port}/v1", api_key="null")
response = client.chat.completions.create(
model="null",
messages=[
{"role": "system", "content": "I'm a helpful AI assistant."},
],
stream=True,
top_p=0.8,
extra_body={"top_k": 20, "min_p":0.1}
)
for chunk in response:
if chunk.choices[0].delta:
print(chunk.choices[0].delta.content, end='')
print('\n')
```
### Min-p Sampling
If you want to use min-p sampling before top-p or top-k_top-p sampling, specify the following parameters when sending a request:
* Example request with curl:
```bash
curl -X POST "http://0.0.0.0:9222/v1/chat/completions" \
-H "Content-Type: application/json" \
-d '{
"messages": [
{"role": "user", "content": "How old are you"}
],
"min_p": 0.1,
"top_p": 0.8,
"top_k": 20
}'
```
* Example request with Python:
```python
import openai
host = "0.0.0.0"
port = "8170"
client = openai.Client(base_url=f"http://{host}:{port}/v1", api_key="null")
response = client.chat.completions.create(
model="null",
messages=[
{"role": "system", "content": "I'm a helpful AI assistant."},
],
stream=True,
top_p=0.8,
extra_body={"top_k": 20, "min_p":0.1}
)
for chunk in response:
if chunk.choices[0].delta:
print(chunk.choices[0].delta.content, end='')
print('\n')
```
With the above configurations, you can flexibly choose and use the appropriate sampling strategy according to the needs of specific generation tasks.
## Parameter Description
`top_p`: The probability cumulative distribution truncation threshold, considering only the most likely token set that reaches this threshold. It is a float type, with a range of [0.0, 1.0]. When top_p=1.0, all tokens are considered; when top_p=0.0, it degenerates into greedy search.
`top_k`: The number of tokens with the highest sampling probability, limiting the sampling range to the top k tokens. It is an int type, with a range of [0, vocab_size].
`min_p`: Low probability filtering threshold, considering only the token set with probability greater than or equal to (`max_prob*min_p`). It is a float type, with a range of [0.0, 1.0].
# Bad Words
Used to prevent the model from generating certain specific words during the inference process. Commonly applied in safety control, content filtering, and behavioral constraints of the model.
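Conceptually, the constraint is enforced by masking the logits of the banned token IDs before sampling, which is why each word must map to a single token (see the parameter description below). The sketch that follows illustrates the mechanism only; the function name and token IDs are placeholders, not FastDeploy internals:
```python
from typing import List
import numpy as np

def mask_bad_words(logits: np.ndarray, bad_word_token_ids: List[int]) -> np.ndarray:
    # Banned token ids get a logit of -inf, so their probability after softmax is zero.
    masked = logits.copy()
    masked[bad_word_token_ids] = -np.inf
    return masked

logits = np.array([1.2, 0.3, -0.5, 2.0])
print(mask_bad_words(logits, bad_word_token_ids=[3]))  # token 3 can no longer be sampled
```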
## Usage Instructions
Include the `bad_words` parameter in the request:
* Example request with curl:
```bash
curl -X POST "http://0.0.0.0:9222/v1/chat/completions" \
-H "Content-Type: application/json" \
-d '{
"messages": [
{"role": "user", "content": "How old are you"}
],
"bad_words": ["age", "I"]
}'
```
* Example request with Python:
```python
import openai
host = "0.0.0.0"
port = "8170"
client = openai.Client(base_url=f"http://{host}:{port}/v1", api_key="null")
response = client.chat.completions.create(
model="null",
messages=[
{"role": "system", "content": "I'm a helpful AI assistant."},
],
extra_body={"bad_words": ["you", "me"]},
stream=True,
)
for chunk in response:
if chunk.choices[0].delta:
print(chunk.choices[0].delta.content, end='')
print('\n')
```
## Parameter Description
`bad_words`: List of forbidden words. Type: list of str. Each word must be a single token.


@@ -113,7 +113,7 @@ curl -X POST "http://0.0.0.0:8180/v1/chat/completions" \
{"type": "text", "text": "From which era does the artifact in the image originate?"}
]}
],
"metadata": {"enable_thinking": false}
"chat_template_kwargs":{"enable_thinking": false}
}'
```


@@ -1,6 +1,7 @@
# Deploy ERNIE-4.5-300B-A47B Model
This document explains how to deploy the ERNIE-4.5 model. Before starting the deployment, please ensure that your hardware environment meets the following requirements:
- GPU Driver >= 535
- CUDA >= 12.3
- CUDNN >= 9.5


@@ -5,7 +5,7 @@
- OS: Linux
- Python: 3.10
- XPU Model: P800
- XPU Driver Version: ≥ 5.0.21.10
- XPU Driver Version: ≥ 5.0.21.26
- XPU Firmware Version: ≥ 1.31
Verified platform:
@@ -15,7 +15,7 @@ Verified platform:
- OS: CentOS release 7.6 (Final)
- Python: 3.10
- XPU Model: P800 (OAM Edition)
- XPU Driver Version: 5.0.21.10
- XPU Driver Version: 5.0.21.26
- XPU Firmware Version: 1.31
**Note:** Currently, only INTEL or Hygon CPU-based P800 (OAM Edition) servers have been verified. Other CPU types and P800 (PCIe Edition) servers have not been tested yet.
@@ -25,9 +25,9 @@ Verified platform:
```bash
mkdir Work
cd Work
docker pull ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/fastdeploy-xpu:2.0.0
docker pull ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/fastdeploy-xpu:2.1.0
docker run --name fastdeploy-xpu --net=host -itd --privileged -v $PWD:/Work -w /Work \
ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/fastdeploy-xpu:2.0.0 \
ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/fastdeploy-xpu:2.1.0 \
/bin/bash
docker exec -it fastdeploy-xpu /bin/bash
```
@@ -37,7 +37,7 @@ docker exec -it fastdeploy-xpu /bin/bash
### Install PaddlePaddle
```bash
python -m pip install paddlepaddle-xpu==3.1.0 -i https://www.paddlepaddle.org.cn/packages/stable/xpu-p800/
python -m pip install paddlepaddle-xpu==3.1.1 -i https://www.paddlepaddle.org.cn/packages/stable/xpu-p800/
```
Alternatively, you can install the latest version of PaddlePaddle (Not recommended)
@@ -49,7 +49,7 @@ python -m pip install --pre paddlepaddle-xpu -i https://www.paddlepaddle.org.cn/
### Install FastDeploy (**Do NOT install via PyPI source**)
```bash
python -m pip install fastdeploy-xpu==2.0.0 -i https://www.paddlepaddle.org.cn/packages/stable/fastdeploy-xpu-p800/ --extra-index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
python -m pip install fastdeploy-xpu==2.1.0 -i https://www.paddlepaddle.org.cn/packages/stable/fastdeploy-xpu-p800/ --extra-index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
```
Alternatively, you can install the latest version of FastDeploy (Not recommended)
@@ -63,7 +63,7 @@ python -m pip install --pre fastdeploy-xpu -i https://www.paddlepaddle.org.cn/pa
### Install PaddlePaddle
```bash
python -m pip install paddlepaddle-xpu==3.1.0 -i https://www.paddlepaddle.org.cn/packages/stable/xpu-p800/
python -m pip install paddlepaddle-xpu==3.1.1 -i https://www.paddlepaddle.org.cn/packages/stable/xpu-p800/
```
Alternatively, you can install the latest version of PaddlePaddle (Not recommended)


@@ -74,7 +74,7 @@ curl -X POST "http://0.0.0.0:8180/v1/chat/completions" \
{"type": "text", "text": "What era does this artifact belong to?"}
]}
],
"metadata": {"enable_thinking": false}
"chat_template_kwargs":{"enable_thinking": false}
}'
```
@@ -96,7 +96,7 @@ response = client.chat.completions.create(
{"type": "text", "text": "What era does this artifact belong to?"},
]},
],
metadata={"enable_thinking": false},
extra_body={"enable_thinking": false},
stream=True,
)
for chunk in response:


@@ -183,6 +183,7 @@ For ```LLM``` configuration, refer to [Parameter Documentation](parameters.md).
* min_p(float): Minimum probability relative to the maximum probability for a token to be considered (>0 filters low-probability tokens to improve quality)
* max_tokens(int): Maximum generated tokens (input + output)
* min_tokens(int): Minimum forced generation length
* bad_words(list[str]): Prohibited words
### 2.5 fastdeploy.engine.request.RequestOutput


@@ -21,9 +21,10 @@ python -m fastdeploy.entrypoints.openai.api_server \
For more usage methods of the command line during service deployment, refer to [Parameter Descriptions](../parameters.md).
## Chat Completion API
FastDeploy provides a Chat Completion API that is compatible with the OpenAI protocol, allowing user requests to be sent directly using OpenAI's request method.
### Sending User Requests
Here is an example of sending a user request using the curl command:
@@ -73,53 +74,327 @@ print('\n')
For a description of the OpenAI protocol, refer to the document [OpenAI Chat Completion API](https://platform.openai.com/docs/api-reference/chat/create).
## Parameter Differences
### Compatible OpenAI Parameters
```python
messages: Union[List[Any], List[int]]
# List of input messages, which can be text messages (`List[Any]`, typically `List[dict]`) or token ID lists (`List[int]`).
tools: Optional[List[ChatCompletionToolsParam]] = None
# List of tool call configurations, used for enabling function calling (Function Calling) or tool usage (e.g., ReAct framework).
model: Optional[str] = "default"
# Specifies the model name or version to use, defaulting to `"default"` (which may point to the base model).
frequency_penalty: Optional[float] = None
# Frequency penalty coefficient, reducing the probability of generating the same token repeatedly (`>1.0` suppresses repetition, `<1.0` encourages repetition, default `None` disables).
logprobs: Optional[bool] = False
# Whether to return the log probabilities of each generated token, used for debugging or analysis.
top_logprobs: Optional[int] = 0
# Returns the top `top_logprobs` tokens and their log probabilities for each generated position (default `0` means no return).
max_tokens: Optional[int] = Field(
    default=None,
    deprecated="max_tokens is deprecated in favor of the max_completion_tokens field",
)
# Deprecated: Maximum number of tokens to generate (recommended to use `max_completion_tokens` instead).
max_completion_tokens: Optional[int] = None
# Maximum number of tokens to generate (recommended alternative to `max_tokens`), no default limit (restricted by the model's context window).
presence_penalty: Optional[float] = None
# Presence penalty coefficient, reducing the probability of generating new topics (unseen topics) (`>1.0` suppresses new topics, `<1.0` encourages new topics, default `None` disables).
stream: Optional[bool] = False
# Whether to enable streaming output (return results token by token), default `False` (returns complete results at once).
stream_options: Optional[StreamOptions] = None
# Additional configurations for streaming output (such as chunk size, timeout, etc.), refer to the specific definition of `StreamOptions`.
temperature: Optional[float] = None
# Temperature coefficient, controlling generation randomness (`0.0` for deterministic generation, `>1.0` for more randomness, default `None` uses the model default).
top_p: Optional[float] = None
# Nucleus sampling threshold, only retaining tokens whose cumulative probability exceeds `top_p` (default `None` disables).
response_format: Optional[AnyResponseFormat] = None
# Specifies the output format (such as JSON, XML, etc.), requires passing a predefined format configuration object.
user: Optional[str] = None
# User identifier, used for tracking or distinguishing requests from different users (default `None` does not pass).
metadata: Optional[dict] = None
# Additional metadata, used for passing custom information (such as request ID, debug markers, etc.).
```
### Additional Parameters Added by FastDeploy
> Note:
When sending requests using curl, the following parameters can be used directly;
When sending requests using openai.Client, these parameters need to be placed in the `extra_body` parameter, e.g. `extra_body={"chat_template_kwargs": {"enable_thinking":True}, "include_stop_str_in_output": True}`.
The following sampling parameters are supported.
```python
top_k: Optional[int] = None
# Limits the consideration to the top K tokens with the highest probability at each generation step, used to control randomness (default None means no limit).
min_p: Optional[float] = None
# Low-probability filtering threshold, only retaining tokens whose probability is at least max_prob * min_p (default None means disabled).
min_tokens: Optional[int] = None
# Forces a minimum number of tokens to be generated, avoiding premature truncation (default None means no limit).
include_stop_str_in_output: Optional[bool] = False
# Whether to include the stop string content in the output (default False, meaning output is truncated when a stop string is encountered).
bad_words: Optional[List[str]] = None
# List of forbidden words (e.g., sensitive words) that the model should avoid generating (default None means no restriction).
repetition_penalty: Optional[float] = None
# Repetition penalty coefficient, reducing the probability of repeating already generated tokens (`>1.0` suppresses repetition, `<1.0` encourages repetition, default None means disabled).
```
The following extra parameters are supported:
```python
chat_template_kwargs: Optional[dict] = None
# Additional parameters passed to the chat template, used for customizing dialogue formats (default None).
reasoning_max_tokens: Optional[int] = None
# Maximum number of tokens to generate during reasoning (e.g., CoT, chain of thought) (default None means using global max_tokens).
structural_tag: Optional[str] = None
# Structural tag, used to mark specific structures of generated content (such as JSON, XML, etc., default None).
guided_json: Optional[Union[str, dict, BaseModel]] = None
# Guides the generation of content conforming to JSON structure, can be a JSON string, dictionary, or Pydantic model (default None).
guided_regex: Optional[str] = None
# Guides the generation of content conforming to regular expression rules (default None means no restriction).
guided_choice: Optional[List[str]] = None
# Guides the generation of content selected from a specified candidate list (default None means no restriction).
guided_grammar: Optional[str] = None
# Guides the generation of content conforming to grammar rules (such as BNF) (default None means no restriction).
return_token_ids: Optional[bool] = None
# Whether to return the token IDs of the generation results instead of text (default None means return text).
prompt_token_ids: Optional[List[int]] = None
# Directly passes the token ID list of the prompt, skipping the text encoding step (default None means using text input).
max_streaming_response_tokens: Optional[int] = None
# Maximum number of tokens returned at a time during streaming output (default None means no limit).
disable_chat_template: Optional[bool] = False
# Whether to disable chat template rendering, using raw input directly (default False means template is enabled).
```
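As a combined example (host, port, and model name are placeholders), a request that mixes standard OpenAI parameters with several of the FastDeploy-specific parameters above could look like this sketch:
```python
import openai

client = openai.Client(base_url="http://0.0.0.0:8188/v1", api_key="EMPTY_API_KEY")
response = client.chat.completions.create(
    model="default",
    messages=[{"role": "user", "content": "Write a short poem about the sea"}],
    temperature=0.8,   # standard OpenAI parameter
    max_tokens=256,
    stream=False,
    extra_body={       # FastDeploy-specific parameters go through extra_body
        "top_k": 20,
        "min_p": 0.1,
        "include_stop_str_in_output": True,
        "chat_template_kwargs": {"enable_thinking": False},
        "return_token_ids": True,
    },
)
print(response.choices[0].message.content)
```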
### Differences in Return Fields
Additional return fields added by FastDeploy:
- `arrival_time`: Cumulative time consumed for all tokens
- `reasoning_content`: Return results of the chain of thought
- `prompt_token_ids`: List of token IDs for the input sequence
- `completion_token_ids`: List of token IDs for the output sequence
Overview of return parameters:
```python
ChatCompletionResponse:
id: str
object: str = "chat.completion"
created: int = Field(default_factory=lambda: int(time.time()))
model: str
choices: List[ChatCompletionResponseChoice]
usage: UsageInfo
ChatCompletionResponseChoice:
index: int
message: ChatMessage
logprobs: Optional[LogProbs] = None
finish_reason: Optional[Literal["stop", "length", "tool_calls", "recover_stop"]]
ChatMessage:
role: str
content: str
reasoning_content: Optional[str] = None
prompt_token_ids: Optional[List[int]] = None
completion_token_ids: Optional[List[int]] = None
# Fields returned for streaming responses
ChatCompletionStreamResponse:
id: str
object: str = "chat.completion.chunk"
created: int = Field(default_factory=lambda: int(time.time()))
model: str
choices: List[ChatCompletionResponseStreamChoice]
usage: Optional[UsageInfo] = None
ChatCompletionResponseStreamChoice:
index: int
delta: DeltaMessage
logprobs: Optional[LogProbs] = None
finish_reason: Optional[Literal["stop", "length", "tool_calls"]] = None
arrival_time: Optional[float] = None
DeltaMessage:
role: Optional[str] = None
content: Optional[str] = None
token_ids: Optional[List[int]] = None
prompt_token_ids: Optional[List[int]] = None
completion_token_ids: Optional[List[int]] = None
reasoning_content: Optional[str] = None
```
## Completion API
The Completion API is mainly used for continuation scenarios. It is suitable for users who provide a fully customized context and only expect the model to output the continuation; the inference process does not concatenate any additional `prompt` template.
### Sending User Requests
Here is an example of sending a user request using the curl command:
```bash
curl -X POST "http://0.0.0.0:8188/v1/completions" \
-H "Content-Type: application/json" \
-d '{
"prompt": "以下是一篇关于深圳文心公园的500字游记和赏析"
}'
```
Here is an example of sending a user request using a Python script:
```python
import openai
host = "0.0.0.0"
port = "8170"
client = openai.Client(base_url=f"http://{host}:{port}/v1", api_key="null")
response = client.completions.create(
model="default",
prompt="以下是一篇关于深圳文心公园的500字游记和赏析",
stream=False,
)
print(response.choices[0].text)
```
For an explanation of the OpenAI protocol, refer to the [OpenAI Completion API](https://platform.openai.com/docs/api-reference/completions/create).
### Compatible OpenAI Parameters
```python
model: Optional[str] = "default"
# Specifies the model name or version to use, defaulting to `"default"` (which may point to the base model).
prompt: Union[List[int], List[List[int]], str, List[str]]
# Input prompt, supporting multiple formats:
# - `str`: Plain text prompt (e.g., `"Hello, how are you?"`).
# - `List[str]`: Multiple text segments (e.g., `["User:", "Hello!", "Assistant:", "Hi!"]`).
# - `List[int]`: Directly passes a list of token IDs (e.g., `[123, 456]`).
# - `List[List[int]]`: List of multiple token ID lists (e.g., `[[123], [456, 789]]`).
best_of: Optional[int] = None
# Generates `best_of` candidate results and returns the highest-scoring one (requires `n=1`).
frequency_penalty: Optional[float] = None
# Frequency penalty coefficient, reducing the probability of generating the same token repeatedly (`>1.0` suppresses repetition, `<1.0` encourages repetition).
logprobs: Optional[int] = None
# Returns the log probabilities of each generated token, can specify the number of candidates to return.
max_tokens: Optional[int] = None
# Maximum number of tokens to generate (including input and output), no default limit (restricted by the model's context window).
presence_penalty: Optional[float] = None
# Presence penalty coefficient, reducing the probability of generating new topics (unseen topics) (`>1.0` suppresses new topics, `<1.0` encourages new topics).
```
### Additional Parameters Added by FastDeploy
> Note:
When sending requests using curl, the following parameters can be used directly;
When sending requests using openai.Client, these parameters need to be placed in the `extra_body` parameter, e.g. `extra_body={"chat_template_kwargs": {"enable_thinking":True}, "include_stop_str_in_output": True}`.
The following sampling parameters are supported.
```python
top_k: Optional[int] = None
# Limits the consideration to the top K tokens with the highest probability at each generation step, used to control randomness (default None means no limit).
min_p: Optional[float] = None
# Low-probability filtering threshold, only retaining tokens whose probability is at least max_prob * min_p (default None means disabled).
min_tokens: Optional[int] = None
# Forces a minimum number of tokens to be generated, avoiding premature truncation (default None means no limit).
include_stop_str_in_output: Optional[bool] = False
# Whether to include the stop string content in the output (default False, meaning output is truncated when a stop string is encountered).
bad_words: Optional[List[str]] = None
# List of forbidden words (e.g., sensitive words) that the model should avoid generating (default None means no restriction).
repetition_penalty: Optional[float] = None
# Repetition penalty coefficient, reducing the probability of repeating already generated tokens (`>1.0` suppresses repetition, `<1.0` encourages repetition, default None means disabled).
```
The following extra parameters are supported:
```python
guided_json: Optional[Union[str, dict, BaseModel]] = None
# Guides the generation of content conforming to JSON structure, can be a JSON string, dictionary, or Pydantic model (default None).
guided_regex: Optional[str] = None
# Guides the generation of content conforming to regular expression rules (default None means no restriction).
guided_choice: Optional[List[str]] = None
# Guides the generation of content selected from a specified candidate list (default None means no restriction).
guided_grammar: Optional[str] = None
# Guides the generation of content conforming to grammar rules (such as BNF) (default None means no restriction).
return_token_ids: Optional[bool] = None
# Whether to return the token IDs of the generation results instead of text (default None means return text).
prompt_token_ids: Optional[List[int]] = None
# Directly passes the token ID list of the prompt, skipping the text encoding step (default None means using text input).
max_streaming_response_tokens: Optional[int] = None
# Maximum number of tokens returned at a time during streaming output (default None means no limit).
```
### Overview of Return Parameters
```python
CompletionResponse:
id: str
object: str = "text_completion"
created: int = Field(default_factory=lambda: int(time.time()))
model: str
choices: List[CompletionResponseChoice]
usage: UsageInfo
CompletionResponseChoice:
index: int
text: str
prompt_token_ids: Optional[List[int]] = None
completion_token_ids: Optional[List[int]] = None
arrival_time: Optional[float] = None
logprobs: Optional[int] = None
reasoning_content: Optional[str] = None
finish_reason: Optional[Literal["stop", "length", "tool_calls"]]
# Fields returned for streaming responses
CompletionStreamResponse
id: str
object: str = "text_completion"
created: int = Field(default_factory=lambda: int(time.time()))
model: str
choices: List[CompletionResponseStreamChoice]
usage: Optional[UsageInfo] = None
CompletionResponseStreamChoice:
index: int
text: str
arrival_time: float = None
prompt_token_ids: Optional[List[int]] = None
completion_token_ids: Optional[List[int]] = None
logprobs: Optional[float] = None
reasoning_content: Optional[str] = None
finish_reason: Optional[Literal["stop", "length", "tool_calls"]] = None
```


@@ -0,0 +1,93 @@
# ERNIE-4.5-0.3B
## Environmental Preparation
### 1.1 Hardware requirements
The minimum number of GPUs required to deploy `ERNIE-4.5-0.3B` on the following hardware for each quantization is as follows:
| | WINT8 | WINT4 | FP8 |
|-----|-----|-----|-----|
|H800 80GB| 1 | 1 | 1 |
|A800 80GB| 1 | 1 | / |
|H20 96GB| 1 | 1 | 1 |
|L20 48GB| 1 | 1 | 1 |
|A30 40GB| 1 | 1 | / |
|A10 24GB| 1 | 1 | / |
**Tips:**
1. To change the number of GPUs used for deployment, specify `--tensor-parallel-size 2` in the startup command.
2. For hardware not listed in the table, you can estimate whether it can be deployed based on the GPU memory.
### 1.2 Install fastdeploy
- Installation: For details, please refer to [Fastdeploy Installation](../get_started/installation/README.md).
- Model Download: For details, please refer to [Supported Models](../supported_models.md). **Please note that models with the Paddle suffix must be used with FastDeploy.**
## 2. How to Use
### 2.1 Basic: Launching the Service
Start the service by following command:
```bash
python -m fastdeploy.entrypoints.openai.api_server \
--model baidu/ERNIE-4.5-0.3B-Paddle \
--tensor-parallel-size 1 \
--quantization wint4 \
--max-model-len 32768 \
--kv-cache-ratio 0.75 \
--max-num-seqs 128
```
- `--quantization`: indicates the quantization strategy used by the model. Different quantization strategies will result in different performance and accuracy of the model. It could be one of `wint8` / `wint4` / `block_wise_fp8`(Hopper is needed).
- `--max-model-len`: Indicates the maximum number of tokens supported by the currently deployed service. The larger the value, the longer the context length the model can support, but the more GPU memory is occupied, which may affect the concurrency.
For more parameter meanings and default settings, see [FastDeploy Parameter Documentation](../parameters.md).
### 2.2 Advanced: How to get better performance
#### 2.2.1 Correctly set parameters that match the application scenario
Evaluate average input length, average output length, and maximum context length
- Set max-model-len according to the maximum context length. For example, if the average input length is 1000 and the output length is 30000, then it is recommended to set it to 32768
- **Enable the service management global block**
```
export ENABLE_V1_KVCACHE_SCHEDULER=1
```
#### 2.2.2 Prefix Caching
**Idea:** The core idea of Prefix Caching is to avoid repeated calculations by caching the intermediate calculation results of the input sequence (KV Cache), thereby speeding up the response speed of multiple requests with the same prefix. For details, refer to [prefix-cache](../features/prefix_caching.md)
**How to enable:**
Add the following lines to the startup parameters, where `--enable-prefix-caching` enables prefix caching, and `--swap-space` enables CPU cache in addition to GPU cache. The size is GB and should be adjusted according to the actual situation of the machine.
```
--enable-prefix-caching
--swap-space 50
```
#### 2.2.3 Chunked Prefill
**Idea:** This strategy is adopted to split the prefill stage request into small-scale sub-chunks, and execute them in batches mixed with the decode request. This can better balance the computation-intensive (Prefill) and memory-intensive (Decode) operations, optimize GPU resource utilization, reduce the computational workload and memory usage of a single Prefill, thereby reducing the peak memory usage and avoiding the problem of insufficient memory. For details, please refer to [Chunked Prefill](../features/chunked_prefill.md)
**How to enable:** Add the following lines to the startup parameters
```
--enable-chunked-prefill
```
#### 2.2.4 CUDAGraph
**Idea:**
CUDAGraph is a GPU computing acceleration technology provided by NVIDIA. It achieves efficient execution and optimization of GPU tasks by capturing CUDA operation sequences into a graph structure. The core idea of CUDAGraph is to encapsulate a series of GPU computing and memory operations into a re-executable graph, thereby reducing CPU-GPU communication overhead, reducing kernel startup latency, and improving overall computing performance.
**How to enable:**
Add the following lines to the startup parameters
```
--use-cudagraph
```
Notes:
1. Usually, no additional parameters need to be set, but CUDAGraph will generate some additional memory overhead, which may need to be adjusted in some scenarios with limited memory. For detailed parameter adjustments, please refer to [GraphOptimizationBackend](../parameters.md) for related configuration parameter descriptions
2. When CUDAGraph is enabled, only single-GPU inference is supported, i.e. `--tensor-parallel-size 1`.
3. When CUDAGraph is enabled, `Chunked Prefill` and `Prefix Caching` cannot be enabled at the same time.
#### 2.2.5 Rejection Sampling
**Idea:**
Rejection sampling is to generate samples from a proposal distribution that is easy to sample, avoiding explicit sorting to increase the sampling speed, which has a significant improvement on small-sized models.
**How to enable:**
Add the following environment variables before starting
```
export FD_SAMPLING_CLASS=rejection
```
## FAQ
If you encounter any problems during use, you can refer to [FAQ](./FAQ.md).


@@ -0,0 +1,149 @@
# ERNIE-4.5-21B-A3B
## Environmental Preparation
### 1.1 Hardware requirements
The minimum number of GPUs required to deploy `ERNIE-4.5-21B-A3B` on the following hardware for each quantization is as follows:
| | WINT8 | WINT4 | FP8 |
|-----|-----|-----|-----|
|H800 80GB| 1 | 1 | 1 |
|A800 80GB| 1 | 1 | / |
|H20 96GB| 1 | 1 | 1 |
|L20 48GB| 1 | 1 | 1 |
|A30 40GB| 2 | 1 | / |
|A10 24GB| 2 | 1 | / |
**Tips:**
1. To change the number of GPUs used for deployment, specify `--tensor-parallel-size 2` in the startup command.
2. For hardware not listed in the table, you can estimate whether it can be deployed based on the GPU memory.
### 1.2 Install fastdeploy and prepare the model
- Installation: For details, please refer to [Fastdeploy Installation](../get_started/installation/README.md).
- Model Download: For details, please refer to [Supported Models](../supported_models.md). **Please note that models with the Paddle suffix must be used with FastDeploy.**
## 2. How to Use
### 2.1 Basic: Launching the Service
Start the service by following command:
```bash
python -m fastdeploy.entrypoints.openai.api_server \
--model baidu/ERNIE-4.5-21B-A3B-Paddle \
--tensor-parallel-size 1 \
--quantization wint4 \
--max-model-len 32768 \
--kv-cache-ratio 0.75 \
--max-num-seqs 128
```
- `--quantization`: indicates the quantization strategy used by the model. Different quantization strategies will result in different performance and accuracy of the model. It could be one of `wint8` / `wint4` / `block_wise_fp8`(Hopper is needed).
- `--max-model-len`: Indicates the maximum number of tokens supported by the currently deployed service. The larger the value, the longer the context length the model can support, but the more GPU memory is occupied, which may affect the concurrency.
For more parameter meanings and default settings, see [FastDeploy Parameter Documentation](../parameters.md).
### 2.2 Advanced: How to get better performance
#### 2.2.1 Correctly set parameters that match the application scenario
Evaluate average input length, average output length, and maximum context length
- Set max-model-len according to the maximum context length. For example, if the average input length is 1000 and the output length is 30000, then it is recommended to set it to 32768
- **Enable the service management global block**
```
export ENABLE_V1_KVCACHE_SCHEDULER=1
```
#### 2.2.2 Prefix Caching
**Idea:** The core idea of Prefix Caching is to avoid repeated calculations by caching the intermediate calculation results of the input sequence (KV Cache), thereby speeding up the response speed of multiple requests with the same prefix. For details, refer to [prefix-cache](../features/prefix_caching.md)
**How to enable:**
Add the following lines to the startup parameters, where `--enable-prefix-caching` enables prefix caching, and `--swap-space` enables CPU cache in addition to GPU cache. The size is GB and should be adjusted according to the actual situation of the machine.
```
--enable-prefix-caching
--swap-space 50
```
#### 2.2.3 Chunked Prefill
**Idea:** This strategy is adopted to split the prefill stage request into small-scale sub-chunks, and execute them in batches mixed with the decode request. This can better balance the computation-intensive (Prefill) and memory-intensive (Decode) operations, optimize GPU resource utilization, reduce the computational workload and memory usage of a single Prefill, thereby reducing the peak memory usage and avoiding the problem of insufficient memory. For details, please refer to [Chunked Prefill](../features/chunked_prefill.md)
**How to enable:** Add the following lines to the startup parameters
```
--enable-chunked-prefill
```
#### 2.2.4 MTP (Multi-Token Prediction)
**Idea:**
By predicting multiple tokens at once, the number of decoding steps is reduced, which significantly speeds up generation while maintaining generation quality through certain strategies. For details, please refer to [Speculative Decoding](../features/speculative_decoding.md).
**How to enable:**
Add the following lines to the startup parameters
```
--speculative-config '{"method": "mtp", "num_speculative_tokens": 1, "model": "${path_to_mtp_model}"}'
```
#### 2.2.5 CUDAGraph
**Idea:**
CUDAGraph is a GPU computing acceleration technology provided by NVIDIA. It achieves efficient execution and optimization of GPU tasks by capturing CUDA operation sequences into a graph structure. The core idea of CUDAGraph is to encapsulate a series of GPU computing and memory operations into a re-executable graph, thereby reducing CPU-GPU communication overhead, reducing kernel startup latency, and improving overall computing performance.
**How to enable:**
Add the following lines to the startup parameters
```
--use-cudagraph
```
Notes:
1. Usually, no additional parameters need to be set, but CUDAGraph will generate some additional memory overhead, which may need to be adjusted in some scenarios with limited memory. For detailed parameter adjustments, please refer to [GraphOptimizationBackend](../parameters.md) for related configuration parameter descriptions
2. When CUDAGraph is enabled, only single-GPU inference is supported, i.e. `--tensor-parallel-size 1`.
3. When CUDAGraph is enabled, `Chunked Prefill` and `Prefix Caching` cannot be enabled at the same time.
#### 2.2.6 Rejection Sampling
**Idea:**
Rejection sampling is to generate samples from a proposal distribution that is easy to sample, avoiding explicit sorting to increase the sampling speed, which has a significant improvement on small-sized models.
**How to enable:**
Add the following environment variables before starting
```
export FD_SAMPLING_CLASS=rejection
```
#### 2.2.7 Disaggregated Deployment
**Idea:** Deploying Prefill and Decode separately in certain scenarios can improve hardware utilization, effectively increase throughput, and reduce overall sentence latency.
**How to enable:** Take the deployment of a single machine with 8 GPUs and 1P1D (4 GPUs each) as an example. Compared with the default hybrid deployment method, `--splitwise-role` is required to specify the role of the node. And the GPUs and logs of the two nodes are isolated through the environment variables `FD_LOG_DIR` and `CUDA_VISIBLE_DEVICES`.
```
# prefill
export CUDA_VISIBLE_DEVICES=0,1,2,3
export INFERENCE_MSG_QUEUE_ID=1315
export FLAGS_max_partition_size=2048
export FD_ATTENTION_BACKEND=FLASH_ATTN
export FD_LOG_DIR="prefill_log"
quant_type=block_wise_fp8
export FD_USE_DEEP_GEMM=0
python -m fastdeploy.entrypoints.openai.api_server --model baidu/ERNIE-4.5-21B-A3B-Paddle \
--max-model-len 131072 \
--max-num-seqs 20 \
--num-gpu-blocks-override 40000 \
--quantization ${quant_type} \
--gpu-memory-utilization 0.9 --kv-cache-ratio 0.9 \
--port 7012 --engine-worker-queue-port 7013 --metrics-port 7014 --tensor-parallel-size 4 \
--cache-queue-port 7015 \
--splitwise-role "prefill" \
```
```
# decode
export CUDA_VISIBLE_DEVICES=4,5,6,7
export INFERENCE_MSG_QUEUE_ID=1215
export FLAGS_max_partition_size=2048
export FD_LOG_DIR="decode_log"
quant_type=block_wise_fp8
export FD_USE_DEEP_GEMM=0
python -m fastdeploy.entrypoints.openai.api_server --model baidu/ERNIE-4.5-21B-A3B-Paddle \
--max-model-len 131072 \
--max-num-seqs 20 \
--quantization ${quant_type} \
--gpu-memory-utilization 0.85 --kv-cache-ratio 0.1 \
--port 9012 --engine-worker-queue-port 8013 --metrics-port 8014 --tensor-parallel-size 4 \
--cache-queue-port 8015 \
--innode-prefill-ports 7013 \
--splitwise-role "decode"
```
## FAQ
If you encounter any problems during use, you can refer to [FAQ](./FAQ.md).

View File

@@ -0,0 +1,127 @@
# ERNIE-4.5-300B-A47B
## Environmental Preparation
### 1.1 Hardware requirements
The minimum number of GPUs required to deploy `ERNIE-4.5-300B-A47B` with each quantization on the following hardware is as follows:
| | WINT8 | WINT4 | FP8 | WINT2 | W4A8 |
|-----|-----|-----|-----|-----|-----|
|H800 80GB| 8 | 4 | 8 | 2 | 4 |
|A800 80GB| 8 | 4 | / | 2 | 4 |
**Tips:**
1. To change the number of deployment GPUs, specify `--tensor-parallel-size 4` in the startup command.
2. Since only the 4-GPU quantization scales are provided, the W4A8 model must be deployed on 4 GPUs.
3. For hardware not listed in the table, you can estimate whether it can be deployed based on the GPU memory.
### 1.2 Install FastDeploy
- Installation: For details, please refer to [Fastdeploy Installation](../get_started/installation/README.md).
- Model download: For details, please refer to [Supported Models](../supported_models.md). **Please note that FastDeploy requires models with the Paddle suffix.**
## 2. How to Use
### 2.1 Basic: Launching the Service
Start the service with the following command:
```bash
python -m fastdeploy.entrypoints.openai.api_server \
--model baidu/ERNIE-4.5-300B-A47B-Paddle \
--tensor-parallel-size 8 \
--quantization wint4 \
--max-model-len 32768 \
--kv-cache-ratio 0.75 \
--max-num-seqs 128
```
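Once the service is up, you can send an OpenAI-compatible request to verify it. The sketch below assumes the service was additionally started with `--port 8180`; substitute whichever port you actually use.
```bash
curl -X POST "http://0.0.0.0:8180/v1/chat/completions" \
  -H "Content-Type: application/json" \
  -d '{
        "messages": [
          {"role": "user", "content": "Write a short introduction to large language models."}
        ]
      }'
```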
- `--quantization`: indicates the quantization strategy used by the model. Different quantization strategies result in different performance and accuracy. Supported values are `wint8` / `wint4` / `block_wise_fp8` (requires a Hopper GPU).
- `--max-model-len`: Indicates the maximum number of tokens supported by the currently deployed service. The larger the value, the longer the context length the model can support, but the more GPU memory is occupied, which may affect the concurrency.
For more parameter meanings and default settings, see [FastDeploy Parameter Documentation](../parameters.md).
### 2.2 Advanced: How to get better performance
#### 2.2.1 Correctly set parameters that match the application scenario
Evaluate the average input length, average output length, and maximum context length of your workload.
- Set `max-model-len` according to the maximum context length. For example, if the average input length is 1000 tokens and the average output length is 30000 tokens, the total is about 31000 tokens, so setting it to 32768 is recommended.
- **Enable the service management global block**
```
export ENABLE_V1_KVCACHE_SCHEDULER=1
```
#### 2.2.2 Prefix Caching
**Idea:** The core idea of Prefix Caching is to cache the intermediate computation results (KV Cache) of the input sequence to avoid repeated computation, thereby speeding up responses for multiple requests that share the same prefix. For details, refer to [prefix-cache](../features/prefix_caching.md)
**How to enable:**
Add the following lines to the startup parameters, where `--enable-prefix-caching` enables prefix caching and `--swap-space` enables a CPU cache in addition to the GPU cache. The unit is GB, and the value should be adjusted according to the machine's actual memory.
```
--enable-prefix-caching
--swap-space 50
```
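For example, a launch command extended with prefix caching might look like the following sketch; the flags other than the two above mirror the basic example in section 2.1 and should be adapted to your deployment.
```bash
python -m fastdeploy.entrypoints.openai.api_server \
  --model baidu/ERNIE-4.5-300B-A47B-Paddle \
  --tensor-parallel-size 8 \
  --quantization wint4 \
  --max-model-len 32768 \
  --max-num-seqs 128 \
  --enable-prefix-caching \
  --swap-space 50
```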
#### 2.2.3 Chunked Prefill
**Idea:** Chunked Prefill splits a prefill-stage request into small sub-chunks and executes them in batches mixed with decode requests. This better balances the compute-intensive (Prefill) and memory-intensive (Decode) operations, improves GPU utilization, and reduces the computation and memory footprint of a single prefill, thereby lowering peak memory usage and avoiding out-of-memory failures. For details, please refer to [Chunked Prefill](../features/chunked_prefill.md)
**How to enable:** Add the following lines to the startup parameters
```
--enable-chunked-prefill
```
#### 2.2.4 MTP (Multi-Token Prediction)
**Idea:**
By predicting multiple tokens per decoding step, MTP reduces the number of decoding steps and significantly speeds up generation, while preserving generation quality through certain strategies. For details, please refer to [Speculative Decoding](../features/speculative_decoding.md).
**How to enable:**
Add the following lines to the startup parameters
```
--speculative-config '{"method": "mtp", "num_speculative_tokens": 1, "model": "${path_to_mtp_model}"}'
```
#### 2.2.5 W4A8C8 Quantization
**Idea:**
Quantization compresses the model, reduces GPU memory usage, and speeds up inference. To achieve better inference results, per-channel symmetric 4-bit quantization is used for the MoE weights, static per-tensor symmetric 8-bit quantization for the activations, and static per-channel symmetric 8-bit quantization for the KV cache.
**How to enable:**
Simply specify the corresponding model name, `baidu/ERNIE-4.5-300B-A47B-W4A8C8-TP4-Paddle`, in the startup command:
```
--model baidu/ERNIE-4.5-300B-A47B-W4A8C8-TP4-Paddle
```
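A sketch of the resulting launch command is shown below. It assumes that, apart from switching the model name and using the required 4-GPU tensor parallel size, no separate `--quantization` flag is needed because the checkpoint is already quantized; verify this assumption against the model card.
```bash
python -m fastdeploy.entrypoints.openai.api_server \
  --model baidu/ERNIE-4.5-300B-A47B-W4A8C8-TP4-Paddle \
  --tensor-parallel-size 4 \
  --max-model-len 32768 \
  --max-num-seqs 128
```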
#### 2.2.6 Rejection Sampling
**Idea:**
Rejection sampling draws samples from a proposal distribution that is easy to sample from, avoiding explicit sorting and thereby speeding up sampling; the improvement is most significant for small models.
**How to enable:**
Add the following environment variable before starting
```
export FD_SAMPLING_CLASS=rejection
```
#### 2.2.7 Disaggregated Deployment
**Idea:** Deploying Prefill and Decode separately can, in certain scenarios, improve hardware utilization, effectively increase throughput, and reduce end-to-end request latency.
**How to enable:** Take a single machine with 8 GPUs running 1P1D (4 GPUs each) as an example. Compared with the default hybrid deployment, `--splitwise-role` is required to specify the role of each node, and the GPUs and logs of the two nodes are isolated via the environment variables `FD_LOG_DIR` and `CUDA_VISIBLE_DEVICES`.
```
export FD_LOG_DIR="log_prefill"
export CUDA_VISIBLE_DEVICES=0,1,2,3
python -m fastdeploy.entrypoints.openai.api_server \
--model baidu/ERNIE-4.5-300B-A47B-Paddle \
--port 8180 --metrics-port 8181 \
--engine-worker-queue-port 8182 \
--cache-queue-port 8183 \
--tensor-parallel-size 4 \
--quantization wint4 \
--splitwise-role "prefill"
```
```
export FD_LOG_DIR="log_decode"
export CUDA_VISIBLE_DEVICES=4,5,6,7
# Note that innode-prefill-ports is set to the engine-worker-queue-port of the Prefill service
python -m fastdeploy.entrypoints.openai.api_server \
--model baidu/ERNIE-4.5-300B-A47B-Paddle \
--port 8184 --metrics-port 8185 \
--engine-worker-queue-port 8186 \
--cache-queue-port 8187 \
--tensor-parallel-size 4 \
--quantization wint4 \
--innode-prefill-ports 8182 \
--splitwise-role "decode"
```
## FAQ
If you encounter any problems during use, you can refer to [FAQ](./FAQ.md).

View File

@@ -0,0 +1,37 @@
# FAQ
## 1. CUDA out of memory
1. When starting the service:
- Check the minimum number of deployment GPUs required for the model and quantization method; if it is not met, increase the number of deployment GPUs.
- If CUDAGraph is enabled, try to reserve more GPU memory for CUDAGraph by lowering `gpu_memory_utilization`, or reduce CUDAGraph's memory usage by lowering `max_num_seqs` and setting `cudagraph_capture_sizes`.
2. During service operation:
- Check whether the log contains information similar to the following. If so, it is usually caused by insufficient output blocks, and you need to reduce `kv-cache-ratio`:
```
need_block_len: 1 free_list_len: 0
step max_id: 2 max_num: 133 encoder block len: 24
recover seq_id: 2 free_list_len: 144 used_list_len: 134
need_block_len: 1 free_list_len: 0
step max_id: 2 max_num: 144 encoder_block_len: 24
```
It is recommended to enable the service management global block. Add the following environment variable before starting the service:
```
export ENABLE_V1_KVCACHE_SCHEDULER=1
```
## 2. Poor model performance
1. First, check whether the output length meets expectations and whether the slowdown is caused by overly long decoding. If long outputs are expected, check whether the log contains information similar to the following. If so, it is usually caused by insufficient output blocks, and you need to reduce `kv-cache-ratio`:
```
need_block_len: 1 free_list_len: 0
step max_id: 2 max_num: 133 encoder block len: 24
recover seq_id: 2 free_list_len: 144 used_list_len: 134
need_block_len: 1 free_list_len: 0
step max_id: 2 max_num: 144 encoder_block_len: 24
```
It is also recommended to enable the service management global block. Add the following environment variable before starting the service:
```
export ENABLE_V1_KVCACHE_SCHEDULER=1
```
2. Check whether the number of KV cache blocks allocated by automatic profiling is as expected. If profiling is affected by GPU memory fluctuations, it may allocate fewer blocks; in that case, manually set the `num_gpu_blocks_override` parameter to enlarge the KV cache, as shown in the sketch below.
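For example, the override can be passed at startup as follows; the value 4096 is purely illustrative and should be sized from the profiling logs and the available GPU memory.
```bash
python -m fastdeploy.entrypoints.openai.api_server \
  --model baidu/ERNIE-4.5-300B-A47B-Paddle \
  --tensor-parallel-size 8 \
  --num-gpu-blocks-override 4096
```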

View File

@@ -5,14 +5,14 @@
|ERNIE-4.5-300B-A47B|32K|WINT4|4 (recommend)|export XPU_VISIBLE_DEVICES="0,1,2,3" or "4,5,6,7"<br>python -m fastdeploy.entrypoints.openai.api_server \ <br> --model PaddlePaddle/ERNIE-4.5-300B-A47B-Paddle \ <br> --port 8188 \ <br> --tensor-parallel-size 4 \ <br> --max-model-len 32768 \ <br> --max-num-seqs 64 \ <br> --quantization "wint4" \ <br> --gpu-memory-utilization 0.9|>=2.0.0|
|ERNIE-4.5-300B-A47B|32K|WINT4|8|export XPU_VISIBLE_DEVICES="0,1,2,3,4,5,6,7"<br>python -m fastdeploy.entrypoints.openai.api_server \ <br> --model PaddlePaddle/ERNIE-4.5-300B-A47B-Paddle \ <br> --port 8188 \ <br> --tensor-parallel-size 8 \ <br> --max-model-len 32768 \ <br> --max-num-seqs 64 \ <br> --quantization "wint4" \ <br> --gpu-memory-utilization 0.9|>=2.0.0|
|ERNIE-4.5-300B-A47B|128K|WINT4|8 (recommend)|export XPU_VISIBLE_DEVICES="0,1,2,3,4,5,6,7"<br>python -m fastdeploy.entrypoints.openai.api_server \ <br> --model PaddlePaddle/ERNIE-4.5-300B-A47B-Paddle \ <br> --port 8188 \ <br> --tensor-parallel-size 8 \ <br> --max-model-len 131072 \ <br> --max-num-seqs 64 \ <br> --quantization "wint4" \ <br> --gpu-memory-utilization 0.9|>=2.0.0|
|ERNIE-4.5-21B-A3B|32K|BF16|1|export XPU_VISIBLE_DEVICES="0" # Specify any card<br>python -m fastdeploy.entrypoints.openai.api_server \ <br> --model PaddlePaddle/ERNIE-4.5-21B-A3B-Paddle \ <br> --port 8188 \ <br> --tensor-parallel-size 1 \ <br> --max-model-len 32768 \ <br> --max-num-seqs 128 \ <br> --gpu-memory-utilization 0.9|>=2.0.3|
|ERNIE-4.5-21B-A3B|32K|WINT8|1|export XPU_VISIBLE_DEVICES="0" # Specify any card<br>python -m fastdeploy.entrypoints.openai.api_server \ <br> --model PaddlePaddle/ERNIE-4.5-21B-A3B-Paddle \ <br> --port 8188 \ <br> --tensor-parallel-size 1 \ <br> --max-model-len 32768 \ <br> --max-num-seqs 128 \ <br> --quantization "wint8" \ <br> --gpu-memory-utilization 0.9|>=2.0.3|
|ERNIE-4.5-21B-A3B|32K|WINT4|1|export XPU_VISIBLE_DEVICES="0" # Specify any card<br>python -m fastdeploy.entrypoints.openai.api_server \ <br> --model PaddlePaddle/ERNIE-4.5-21B-A3B-Paddle \ <br> --port 8188 \ <br> --tensor-parallel-size 1 \ <br> --max-model-len 32768 \ <br> --max-num-seqs 128 \ <br> --quantization "wint4" \ <br> --gpu-memory-utilization 0.9|>=2.0.3|
|ERNIE-4.5-21B-A3B|128K|BF16|1|export XPU_VISIBLE_DEVICES="0" # Specify any card<br>python -m fastdeploy.entrypoints.openai.api_server \ <br> --model PaddlePaddle/ERNIE-4.5-21B-A3B-Paddle \ <br> --port 8188 \ <br> --tensor-parallel-size 1 \ <br> --max-model-len 131072 \ <br> --max-num-seqs 128 \ <br> --gpu-memory-utilization 0.9|>=2.0.3|
|ERNIE-4.5-21B-A3B|128K|WINT8|1|export XPU_VISIBLE_DEVICES="0" # Specify any card<br>python -m fastdeploy.entrypoints.openai.api_server \ <br> --model PaddlePaddle/ERNIE-4.5-21B-A3B-Paddle \ <br> --port 8188 \ <br> --tensor-parallel-size 1 \ <br> --max-model-len 131072 \ <br> --max-num-seqs 128 \ <br> --quantization "wint8" \ <br> --gpu-memory-utilization 0.9|>=2.0.3|
|ERNIE-4.5-21B-A3B|128K|WINT4|1|export XPU_VISIBLE_DEVICES="0" # Specify any card<br>python -m fastdeploy.entrypoints.openai.api_server \ <br> --model PaddlePaddle/ERNIE-4.5-21B-A3B-Paddle \ <br> --port 8188 \ <br> --tensor-parallel-size 1 \ <br> --max-model-len 131072 \ <br> --max-num-seqs 128 \ <br> --quantization "wint4" \ <br> --gpu-memory-utilization 0.9|>=2.0.3|
|ERNIE-4.5-21B-A3B|32K|BF16|1|export XPU_VISIBLE_DEVICES="0" # Specify any card<br>python -m fastdeploy.entrypoints.openai.api_server \ <br> --model PaddlePaddle/ERNIE-4.5-21B-A3B-Paddle \ <br> --port 8188 \ <br> --tensor-parallel-size 1 \ <br> --max-model-len 32768 \ <br> --max-num-seqs 128 \ <br> --gpu-memory-utilization 0.9|>=2.1.0|
|ERNIE-4.5-21B-A3B|32K|WINT8|1|export XPU_VISIBLE_DEVICES="0" # Specify any card<br>python -m fastdeploy.entrypoints.openai.api_server \ <br> --model PaddlePaddle/ERNIE-4.5-21B-A3B-Paddle \ <br> --port 8188 \ <br> --tensor-parallel-size 1 \ <br> --max-model-len 32768 \ <br> --max-num-seqs 128 \ <br> --quantization "wint8" \ <br> --gpu-memory-utilization 0.9|>=2.1.0|
|ERNIE-4.5-21B-A3B|32K|WINT4|1|export XPU_VISIBLE_DEVICES="0" # Specify any card<br>python -m fastdeploy.entrypoints.openai.api_server \ <br> --model PaddlePaddle/ERNIE-4.5-21B-A3B-Paddle \ <br> --port 8188 \ <br> --tensor-parallel-size 1 \ <br> --max-model-len 32768 \ <br> --max-num-seqs 128 \ <br> --quantization "wint4" \ <br> --gpu-memory-utilization 0.9|>=2.1.0|
|ERNIE-4.5-21B-A3B|128K|BF16|1|export XPU_VISIBLE_DEVICES="0" # Specify any card<br>python -m fastdeploy.entrypoints.openai.api_server \ <br> --model PaddlePaddle/ERNIE-4.5-21B-A3B-Paddle \ <br> --port 8188 \ <br> --tensor-parallel-size 1 \ <br> --max-model-len 131072 \ <br> --max-num-seqs 128 \ <br> --gpu-memory-utilization 0.9|>=2.1.0|
|ERNIE-4.5-21B-A3B|128K|WINT8|1|export XPU_VISIBLE_DEVICES="0" # Specify any card<br>python -m fastdeploy.entrypoints.openai.api_server \ <br> --model PaddlePaddle/ERNIE-4.5-21B-A3B-Paddle \ <br> --port 8188 \ <br> --tensor-parallel-size 1 \ <br> --max-model-len 131072 \ <br> --max-num-seqs 128 \ <br> --quantization "wint8" \ <br> --gpu-memory-utilization 0.9|>=2.1.0|
|ERNIE-4.5-21B-A3B|128K|WINT4|1|export XPU_VISIBLE_DEVICES="0" # Specify any card<br>python -m fastdeploy.entrypoints.openai.api_server \ <br> --model PaddlePaddle/ERNIE-4.5-21B-A3B-Paddle \ <br> --port 8188 \ <br> --tensor-parallel-size 1 \ <br> --max-model-len 131072 \ <br> --max-num-seqs 128 \ <br> --quantization "wint4" \ <br> --gpu-memory-utilization 0.9|>=2.1.0|
|ERNIE-4.5-0.3B|32K|BF16|1|export XPU_VISIBLE_DEVICES="0" # Specify any card<br>python -m fastdeploy.entrypoints.openai.api_server \ <br> --model PaddlePaddle/ERNIE-4.5-0.3B-Paddle \ <br> --port 8188 \ <br> --tensor-parallel-size 1 \ <br> --max-model-len 32768 \ <br> --max-num-seqs 128 \ <br> --gpu-memory-utilization 0.9|>=2.0.3|
|ERNIE-4.5-0.3B|32K|WINT8|1|export XPU_VISIBLE_DEVICES="x" # Specify any card<br>python -m fastdeploy.entrypoints.openai.api_server \ <br> --model PaddlePaddle/ERNIE-4.5-0.3B-Paddle \ <br> --port 8188 \ <br> --tensor-parallel-size 1 \ <br> --max-model-len 32768 \ <br> --max-num-seqs 128 \ <br> --quantization "wint8" \ <br> --gpu-memory-utilization 0.9|>=2.0.3|
|ERNIE-4.5-0.3B|32K|WINT8|1|export XPU_VISIBLE_DEVICES="0" # Specify any card<br>python -m fastdeploy.entrypoints.openai.api_server \ <br> --model PaddlePaddle/ERNIE-4.5-0.3B-Paddle \ <br> --port 8188 \ <br> --tensor-parallel-size 1 \ <br> --max-model-len 32768 \ <br> --max-num-seqs 128 \ <br> --quantization "wint8" \ <br> --gpu-memory-utilization 0.9|>=2.0.3|
|ERNIE-4.5-0.3B|128K|BF16|1|export XPU_VISIBLE_DEVICES="0" # Specify any card<br>python -m fastdeploy.entrypoints.openai.api_server \ <br> --model PaddlePaddle/ERNIE-4.5-0.3B-Paddle \ <br> --port 8188 \ <br> --tensor-parallel-size 1 \ <br> --max-model-len 131072 \ <br> --max-num-seqs 128 \ <br> --gpu-memory-utilization 0.9|>=2.0.3|
|ERNIE-4.5-0.3B|128K|WINT8|1|export XPU_VISIBLE_DEVICES="0" # Specify any card<br>python -m fastdeploy.entrypoints.openai.api_server \ <br> --model PaddlePaddle/ERNIE-4.5-0.3B-Paddle \ <br> --port 8188 \ <br> --tensor-parallel-size 1 \ <br> --max-model-len 131072 \ <br> --max-num-seqs 128 \ <br> --quantization "wint8" \ <br> --gpu-memory-utilization 0.9|>=2.0.3|

View File

@@ -0,0 +1,117 @@
# 早停功能
早停功能用于提前结束模型生成token的过程具体来说早停功能会采取不同的策略判断当前生成的token序列是否满足早停条件如果满足则提前结束token生成。FastDeploy目前支持`Repetition`策略和`Stop Sequence`策略。
## 1.Repetition策略
* Repetition策略通过检查生成高概率token的次数决定是否需要触发早停功能。
* 具体来说当某个batch生成token的概率连续超过用户设置的概率阈值达到用户指定的次数将提前结束该batch的token生成过程。
### 使用说明
在启动服务时,添加早停功能的启动项。
* 在线推理启动示例:
* 使用默认超参数:--enable-early-stop
```shell
python -m fastdeploy.entrypoints.openai.api_server \
--model baidu/ERNIE-4.5-0.3B-Paddle \
--port 8180 \
--metrics-port 8181 \
--engine-worker-queue-port 8182 \
--max-model-len 32768 \
--max-num-seqs 32 \
--enable-early-stop
```
* 使用自定义超参数:--early-stop-config
```shell
python -m fastdeploy.entrypoints.openai.api_server \
--model baidu/ERNIE-4.5-0.3B-Paddle \
--port 8180 \
--metrics-port 8181 \
--engine-worker-queue-port 8182 \
--max-model-len 32768 \
--max-num-seqs 32 \
--early-stop-config '{"enable_early_stop":true, "window_size": 1000, "threshold": 0.9}'
```
* 离线推理示例
* 使用默认超参数enable_early_stop
```python
from fastdeploy.engine.sampling_params import SamplingParams
from fastdeploy.entrypoints.llm import LLM
model_name_or_path = "baidu/ERNIE-4.5-0.3B-Paddle"
sampling_params = SamplingParams(temperature=0.1, max_tokens=30)
llm = LLM(model=model_name_or_path, tensor_parallel_size=1, enable_early_stop=True)
output = llm.generate(prompts="who are you?", use_tqdm=True, sampling_params=sampling_params)
print(output)
```
* 使用自定义超参数early_stop_config
```python
from fastdeploy.engine.sampling_params import SamplingParams
from fastdeploy.entrypoints.llm import LLM
model_name_or_path = "baidu/ERNIE-4.5-0.3B-Paddle"
early_stop_config = {"enable_early_stop":True, "window_size":1000, "threshold":0.9}
sampling_params = SamplingParams(temperature=0.1, max_tokens=30)
llm = LLM(model=model_name_or_path, tensor_parallel_size=1, early_stop_config=early_stop_config)
output = llm.generate(prompts="who are you?", use_tqdm=True, sampling_params=sampling_params)
print(output)
```
### 参数说明
* `enable_early_stop`: (bool) 是否启用早停功能默认设置为False。
* `strategy`: (str) 早停功能使用的策略目前仅支持repetition策略默认设置为"repetition"。
* `window_size`: (int) repetition策略中连续出现高概率token的次数上限超过该次数将触发早停功能默认设置为3000。
* `threshold`: (float) repetition策略中的高概率阈值默认设置为0.99。
## 2.Stop Sequence策略
* Stop Sequence策略通过检查生成的token序列是否包含用户指定的停止序列决定是否需要触发早停功能。
* 具体来说当某个batch生成的token序列中包含用户指定的停止序列时将提前结束该batch的token生成过程。
### 使用说明
启动服务前,设置下列环境变量
```
FD_STOP_SEQS_MAX_LEN 表示支持停止序列的最大长度默认为8
FD_MAX_STOP_SEQS_NUM表示支持停止序列的最大数量默认为5
```
在请求服务时,在请求中包含`stop`字段,可以是`str`或`List[str]`。
* 在线推理请求示例请求时添加stop参数
```
# create a chat request with "stop" parameter
import openai
ip = "0.0.0.0"
service_http_port = "8233"
client = openai.Client(base_url=f"http://{ip}:{service_http_port}/v1", api_key="EMPTY_API_KEY")
response = client.chat.completions.create(
model="default",
messages=[
{"role": "user", "content": '今天天气真好'},
],
temperature=1.0,
top_p=0,
stream=False,
stop=["明天", "出去走走"]
)
```
* 离线推理请求,在`SamplingParams`中增加`stop`参数
```
from fastdeploy.engine.sampling_params import SamplingParams
from fastdeploy.entrypoints.llm import LLM
model_name_or_path = "ERNIE-4.5-21B-A3B-Paddle"
sampling_params = SamplingParams(temperature=1, top_p=0, stop=["出去走走"])
llm = LLM(model=model_name_or_path, tensor_parallel_size=1)
output = llm.chat(messages=[{"role": "user", "content": "今天天气真好"}], use_tqdm=True, sampling_params=sampling_params)
print(output)
```

View File

@@ -56,6 +56,7 @@ python -m fastdeploy.entrypoints.openai.api_server \
--port 8801 \
--metrics-port 8802 \
--engine-worker-queue-port 8803 \
--model baidu/ERNIE-4.5-0.3B-Paddle \
--scheduler-name global \
--scheduler-ttl 900 \
--scheduler-host "127.0.0.1" \
@@ -63,7 +64,7 @@ python -m fastdeploy.entrypoints.openai.api_server \
--scheduler-db 0 \
--scheduler-password "" \
--scheduler-topic "default" \
--scheduler-min-load_score 3 \
--scheduler-min-load-score 3 \
--scheduler-load-shards-num 1
```

View File

@@ -8,18 +8,18 @@
| baidu/ERNIE-4.5-VL-424B-A47B-Paddle | ernie-45-vl | ✓ |
| baidu/ERNIE-4.5-VL-28B-A3B-Paddle | ernie-45-vl | ✓ |
思考模型需要指定解析器,以便于对思考内容进行解析. 通过`enable_thinking=False` 参数可以关闭模型思考模式.
思考模型需要指定解析器,以便于对思考内容进行解析. 通过 `"enable_thinking": false` 参数可以关闭模型思考模式.
可以支持思考模式开关的接口:
1. OpenAI 服务中 `/v1/chat/completions` 请求.
2. OpenAI Python客户端中 `/v1/chat/completions` 请求.
3. Offline 接口中 `llm.chat`请求.
同时在思考模型中,支持通过```reasoning_max_tokens```控制思考内容的长度,在请求中添加```metadata={"reasoning_max_tokens": 1024}```即可。
同时在思考模型中,支持通过 `reasoning_max_tokens` 控制思考内容的长度,在请求中添加 `"reasoning_max_tokens": 1024` 即可。
## 快速使用
在启动模型服务时, 通过`--reasoning-parser`参数指定解析器名称.
该解析器会解析思考模型的输出, 提取`reasoning_content`字段.
在启动模型服务时, 通过 `--reasoning-parser` 参数指定解析器名称.
该解析器会解析思考模型的输出, 提取 `reasoning_content` 字段.
```bash
python -m fastdeploy.entrypoints.openai.api_server \
@@ -43,15 +43,16 @@ curl -X POST "http://0.0.0.0:8192/v1/chat/completions" \
{"type": "text", "text": "图中的文物属于哪个年代"}
]}
],
"metadata": {"enable_thinking": true}
"chat_template_kwargs":{"enable_thinking": true},
"reasoning_max_tokens": 1024
}'
```
字段`reasoning_content`包含得出最终结论的思考步骤,而`content`字段包含最终结论。
字段 `reasoning_content` 包含得出最终结论的思考步骤,而 `content` 字段包含最终结论。
### 流式会话
在流式会话中, `reasoning_content`字段会可以在`chat completion response chunks`中的 `delta` 中获取
在流式会话中, `reasoning_content` 字段可以在 `chat completion response chunks` 中的 `delta` 中获取
```python
from openai import OpenAI
@@ -69,7 +70,10 @@ chat_response = client.chat.completions.create(
],
model="vl",
stream=True,
metadata={"enable_thinking": True}
extra_body={
"chat_template_kwargs":{"enable_thinking": True},
"reasoning_max_tokens": 1024
}
)
for chunk in chat_response:
if chunk.choices[0].delta is not None:

View File

@@ -0,0 +1,225 @@
# 采样策略
采样策略用于决定如何从模型的输出概率分布中选择下一个token。FastDeploy目前支持 Top-p 、 Top-k_Top-p 和 Min-p Samping 多种采样策略。
1. Top-p 采样
* Top-p 采样根据概率累积分布进行截断,仅考虑累计概率达到指定阈值 p 的最可能 token 集合。
* 动态选择考虑的 token 数量,保证了结果的多样性,同时避免了不太可能的 token。
2. Top-k_top-p 采样
* 首先进行 top-k 采样,然后在 top-k 的结果上进行归一化,再进行 top-p 采样。
* 通过限制初始选择范围top-k并在其中进行概率累积选择top-p提高了生成文本的质量和连贯性。
3. Min-p 采样
* Min-p 采样首先计算 pivot=max_prob * min_p然后只保留概率大于pivot的token(其余设置为0)进行后续的采样。
* 用于过滤掉相对概率过低的token只从高概率token中采样提高生成质量。
## 使用说明
在部署时,可以通过设置环境变量 `FD_SAMPLING_CLASS` 来选择采样算法。可选择的值有 `base`、`base_non_truncated`、`air` 和 `rejection`
**仅支持 Top-p Sampling 的算法**
* `base`(default):直接使用 `top_p` 的值进行归一化倾向于采样概率更大的token。
* `base_non_truncated`:严格按照 Top-p 采样的逻辑执行,首先选择使累积概率达到 `top_p` 的最小集合,然后对这些选择的元素进行归一化。
* `air`:该算法参考 [TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM)的实现,支持 Top-p 采样。
**支持 Top-p 和 Top-k_top-p 采样的算法**
* `rejection`:该算法参考 [flashinfer](https://github.com/flashinfer-ai/flashinfer) 的实现,支持灵活设置 `top_k` 和 `top_p` 参数进行 Top-p 或 Top-k_top-p 采样。
## 配置方式
### Top-p 采样
1. 在部署时设置环境变量以选择采样算法默认为base
```bash
export FD_SAMPLING_CLASS=rejection # base, base_non_truncated, or air
```
2. 在发送请求时指定top_p参数
* 使用 curl 命令发送用户请求示例如下:
```bash
curl -X POST "http://0.0.0.0:9222/v1/chat/completions" \
-H "Content-Type: application/json" \
-d '{
"messages": [
{"role": "user", "content": "How old are you"}
],
"top_p": 0.8
}'
```
* 使用 python 脚本发送用户请求示例如下:
```python
import openai
host = "0.0.0.0"
port = "8170"
client = openai.Client(base_url=f"http://{host}:{port}/v1", api_key="null")
response = client.chat.completions.create(
model="null",
messages=[
{"role": "system", "content": "I'm a helpful AI assistant."},
{"role": "user", "content": "把李白的静夜思改写为现代诗"},
],
stream=True,
top_p=0.8
)
for chunk in response:
if chunk.choices[0].delta:
print(chunk.choices[0].delta.content, end='')
print('\n')
```
### Top-k_top-p 采样
1. 在部署时设置环境变量以选择rejection采样算法
```bash
export FD_SAMPLING_CLASS=rejection
```
2. 在发送请求时,指定以下参数:
* 使用 curl 命令发送用户请求示例如下:
```bash
curl -X POST "http://0.0.0.0:9222/v1/chat/completions" \
-H "Content-Type: application/json" \
-d '{
"messages": [
{"role": "user", "content": "How old are you"}
],
"top_p": 0.8,
"top_k": 20
}'
```
* 使用 python 脚本发送用户请求示例如下:
```python
import openai
host = "0.0.0.0"
port = "8170"
client = openai.Client(base_url=f"http://{host}:{port}/v1", api_key="null")
response = client.chat.completions.create(
model="null",
messages=[
{"role": "system", "content": "I'm a helpful AI assistant."},
{"role": "user", "content": "把李白的静夜思改写为现代诗"},
],
stream=True,
top_p=0.8,
extra_body={"top_k": 20}
)
for chunk in response:
if chunk.choices[0].delta:
print(chunk.choices[0].delta.content, end='')
print('\n')
```
### Min-p 采样
如果你希望在 top_p 或 top_k_top_p 采样之前使用 min_p 采样,在发送请求时指定以下参数:
* 使用 curl 命令发送用户请求示例如下:
```bash
curl -X POST "http://0.0.0.0:9222/v1/chat/completions" \
-H "Content-Type: application/json" \
-d '{
"messages": [
{"role": "user", "content": "How old are you"}
],
"min_p": 0.1,
"top_p": 0.8,
"top_k": 20
}'
```
* 使用 python 脚本发送用户请求示例如下:
```python
import openai
host = "0.0.0.0"
port = "8170"
client = openai.Client(base_url=f"http://{host}:{port}/v1", api_key="null")
response = client.chat.completions.create(
model="null",
messages=[
{"role": "system", "content": "I'm a helpful AI assistant."},
{"role": "user", "content": "把李白的静夜思改写为现代诗"},
],
stream=True,
top_p=0.8,
extra_body={"top_k": 20, "min_p": 0.1}
)
for chunk in response:
if chunk.choices[0].delta:
print(chunk.choices[0].delta.content, end='')
print('\n')
```
通过上述配置,你可以根据具体的生成任务需求,灵活选择和使用合适的采样策略。
## 参数说明
* `top_p`: 概率累积分布截断阈值仅考虑累计概率达到此阈值的最可能token集合。float类型取值范围为[0.0,1.0]。当top_p=1.0时考虑所有token当top_p=0.0时退化为greedy search。
* `top_k`: 采样概率最高的token数量考虑概率最高的k个token进行采样范围限制。int类型取值范围为[0,vocab_size]
* `min_p`:低概率过滤阈值,仅考虑概率大于等于(max_prob*min_p)的token集合。float类型取值范围为[0.0,1.0]
# Bad Words
用于在推理过程中禁止模型生成某些特定词,常用于安全控制、内容过滤、模型行为约束等场景。
## 使用说明
请求中加入bad_words参数
* 使用 curl 命令发送用户请求示例如下:
```bash
curl -X POST "http://0.0.0.0:9222/v1/chat/completions" \
-H "Content-Type: application/json" \
-d '{
"messages": [
{"role": "user", "content": "How old are you"}
],
"bad_words": ["age", "I"]
}'
```
* 使用 python 脚本发送用户请求示例如下:
```python
import openai
host = "0.0.0.0"
port = "8170"
client = openai.Client(base_url=f"http://{host}:{port}/v1", api_key="null")
response = client.chat.completions.create(
model="null",
messages=[
{"role": "system", "content": "I'm a helpful AI assistant."},
],
extra_body={"bad_words": ["you", "me"]},
stream=True,
)
for chunk in response:
if chunk.choices[0].delta:
print(chunk.choices[0].delta.content, end='')
print('\n')
```
## 参数说明
* `bad_words`: 禁止生成的词列表。list类型每个元素为str类型。仅支持每个元素为单个token。

View File

@@ -110,7 +110,7 @@ curl -X POST "http://0.0.0.0:8180/v1/chat/completions" \
{"type": "text", "text": "图中的文物属于哪个年代"}
]}
],
"metadata": {"enable_thinking": false}
"chat_template_kwargs":{"enable_thinking": false}
}'
```

View File

@@ -5,7 +5,7 @@
- OSLinux
- Python3.10
- XPU 型号P800
- XPU 驱动版本:≥ 5.0.21.10
- XPU 驱动版本:≥ 5.0.21.26
- XPU 固件版本:≥ 1.31
已验证的平台:
@@ -15,7 +15,7 @@
- OSCentOS release 7.6 (Final)
- Python3.10
- XPU 型号P800OAM 版)
- XPU 驱动版本5.0.21.10
- XPU 驱动版本5.0.21.26
- XPU 固件版本1.31
**注:** 目前只验证过 INTEL 或海光 CPU OAM 版 P800 服务器,暂未验证其它 CPU 和 PCIe 版 P800 服务器。
@@ -25,9 +25,9 @@
```bash
mkdir Work
cd Work
docker pull ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/fastdeploy-xpu:2.0.0
docker pull ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/fastdeploy-xpu:2.1.0
docker run --name fastdeploy-xpu --net=host -itd --privileged -v $PWD:/Work -w /Work \
ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/fastdeploy-xpu:2.0.0 \
ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/fastdeploy-xpu:2.1.0 \
/bin/bash
docker exec -it fastdeploy-xpu /bin/bash
```
@@ -37,7 +37,7 @@ docker exec -it fastdeploy-xpu /bin/bash
### 安装 PaddlePaddle
```bash
python -m pip install paddlepaddle-xpu==3.1.0 -i https://www.paddlepaddle.org.cn/packages/stable/xpu-p800/
python -m pip install paddlepaddle-xpu==3.1.1 -i https://www.paddlepaddle.org.cn/packages/stable/xpu-p800/
```
或者您也可以安装最新版 PaddlePaddle不推荐
@@ -49,7 +49,7 @@ python -m pip install --pre paddlepaddle-xpu -i https://www.paddlepaddle.org.cn/
### 安装 FastDeploy**注意不要通过 pypi 源安装**
```bash
python -m pip install fastdeploy-xpu==2.0.0 -i https://www.paddlepaddle.org.cn/packages/stable/fastdeploy-xpu-p800/ --extra-index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
python -m pip install fastdeploy-xpu==2.1.0 -i https://www.paddlepaddle.org.cn/packages/stable/fastdeploy-xpu-p800/ --extra-index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
```
或者你也可以安装最新版 FastDeploy不推荐
@@ -63,7 +63,7 @@ python -m pip install --pre fastdeploy-xpu -i https://www.paddlepaddle.org.cn/pa
### 安装 PaddlePaddle
```bash
python -m pip install paddlepaddle-xpu==3.1.0 -i https://www.paddlepaddle.org.cn/packages/stable/xpu-p800/
python -m pip install paddlepaddle-xpu==3.1.1 -i https://www.paddlepaddle.org.cn/packages/stable/xpu-p800/
```
或者您也可以安装最新版 PaddlePaddle不推荐

View File

@@ -73,7 +73,7 @@ curl -X POST "http://0.0.0.0:8180/v1/chat/completions" \
{"type": "text", "text": "图中的文物属于哪个年代"}
]}
],
"metadata": {"enable_thinking": false}
"chat_template_kwargs":{"enable_thinking": false}
}'
```
@@ -93,7 +93,7 @@ response = client.chat.completions.create(
{"type": "text", "text": "图中的文物属于哪个年代?"},
]},
],
metadata={"enable_thinking": false},
extra_body={"enable_thinking": false},
stream=True,
)
for chunk in response:

View File

@@ -2,12 +2,12 @@
**FastDeploy** 是基于飞桨PaddlePaddle的大语言模型LLM与视觉语言模型VLM推理部署工具包提供**开箱即用的生产级部署方案**,核心技术特性包括:
🚀 **负载均衡式PD分解**工业级解决方案支持上下文缓存与动态实例角色切换在保障SLO达标和吞吐量的同时优化资源利用率
🔄 **统一KV缓存传输**轻量级高性能传输库支持智能NVLink/RDMA选择
🤝 **OpenAI API服务与vLLM兼容**:单命令部署,兼容[vLLM](https://github.com/vllm-project/vllm/)接口
🧮 **全量化格式支持**W8A16、W8A8、W4A16、W4A8、W2A16、FP8等
**高级加速技术**推测解码、多令牌预测MTP及分块预填充
🖥️ **多硬件支持**NVIDIA GPU、昆仑芯XPU、海光DCU、昇腾NPU、天数智芯GPU、燧原GCU、沐曦GPU等
- 🚀 **负载均衡式PD分解**工业级解决方案支持上下文缓存与动态实例角色切换在保障SLO达标和吞吐量的同时优化资源利用率
- 🔄 **统一KV缓存传输**轻量级高性能传输库支持智能NVLink/RDMA选择
- 🤝 **OpenAI API服务与vLLM兼容**:单命令部署,兼容[vLLM](https://github.com/vllm-project/vllm/)接口
- 🧮 **全量化格式支持**W8A16、W8A8、W4A16、W4A8、W2A16、FP8等
- **高级加速技术**推测解码、多令牌预测MTP及分块预填充
- 🖥️ **多硬件支持**NVIDIA GPU、昆仑芯XPU、海光DCU、昇腾NPU、天数智芯GPU、燧原GCU、沐曦GPU等
## 支持模型

View File

@@ -183,6 +183,7 @@ for output in outputs:
* min_p(float): token入选的最小概率阈值(相对于最高概率token的比值设为>0可通过过滤低概率token来提升文本生成质量)
* max_tokens(int): 限制模型生成的最大token数量包括输入和输出
* min_tokens(int): 强制模型生成的最少token数量避免过早结束
* bad_words(list[str]): 禁止生成的词列表, 防止模型生成不希望出现的词
### 2.5 fastdeploy.engine.request.RequestOutput

View File

@@ -21,9 +21,10 @@ python -m fastdeploy.entrypoints.openai.api_server \
服务部署时的命令行更多使用方式参考[参数说明](../parameters.md)。
## 发送用户请求
## Chat Completion API
FastDeploy 接口兼容 OpenAI 的 Chat Completion API用户可以通过 OpenAI 协议发送用户请求。
FastDeploy 接口兼容 OpenAI 协议,可以直接使用 OpenAI 的请求方式发送用户请求
### 发送用户请求
使用 curl 命令发送用户请求示例如下:
@@ -71,29 +72,124 @@ for chunk in response:
print('\n')
```
关于 OpenAI 协议的说明可参考文档 [OpenAI Chat Compeltion API](https://platform.openai.com/docs/api-reference/chat/create)。
关于 OpenAI 协议的说明可参考文档 [OpenAI Chat Completion API](https://platform.openai.com/docs/api-reference/chat/create)。
## 参数差异
### 请求参数差异
FastDeploy 与 OpenAI 协议的请求参数差异如下,其余请求参数会被忽略:
- `prompt` (仅支持 `v1/completions` 接口)
- `messages` (仅支持 `v1/chat/completions` 接口)
- `logprobs`: Optional[bool] = False (仅支持 `v1/chat/completions` 接口)
- `top_logprobs`: Optional[int] = None (仅支持 `v1/chat/completions` 接口。如果使用这个参数必须设置logprobs为True取值大于等于0小于20)
- `frequency_penalty`: Optional[float] = 0.0
- `max_tokens`: Optional[int] = 16
- `presence_penalty`: Optional[float] = 0.0
- `stream`: Optional[bool] = False
- `stream_options`: Optional[StreamOptions] = None
- `temperature`: Optional[float] = None
- `top_p`: Optional[float] = None
- `metadata`: Optional[dict] = None (仅在v1/chat/completions中支持用于配置额外参数, 如metadata={"enable_thinking": True})
- `min_tokens`: Optional[int] = 1 最小生成的Token个数
- `reasoning_max_tokens`: Optional[int] = None 思考内容最大Token数默认与max_tokens一致
- `enable_thinking`: Optional[bool] = True 支持深度思考的模型是否打开思考
- `repetition_penalty`: Optional[float] = None: 直接对重复生成的token进行惩罚的系数>1时惩罚重复<1时鼓励重复
### 兼容OpenAI 参数
```python
messages: Union[List[Any], List[int]]
# 输入消息列表,可以是文本消息(`List[Any]`,通常为 `List[dict]`)或 token ID 列表(`List[int]`)。
> 注: 若为多模态模型 由于思考链默认打开导致输出过长max tokens 可以设置为模型最长输出,或使用默认值。
tools: Optional[List[ChatCompletionToolsParam]] = None
# 工具调用配置列表用于启用函数调用Function Calling或工具使用如 ReAct 框架)。
model: Optional[str] = "default"
# 指定使用的模型名称或版本,默认值为 `"default"`(可能指向基础模型)。
frequency_penalty: Optional[float] = None
# 频率惩罚系数,降低重复生成相同 token 的概率(`>1.0` 抑制重复,`<1.0` 鼓励重复,默认 `None` 禁用)。
logprobs: Optional[bool] = False
# 是否返回每个生成 token 的对数概率log probabilities用于调试或分析。
top_logprobs: Optional[int] = 0
# 返回每个生成位置概率最高的 `top_logprobs` 个 token 及其对数概率(默认 `0` 表示不返回)。
max_tokens: Optional[int] = Field(
default=None,
deprecated="max_tokens is deprecated in favor of the max_completion_tokens field",
)
# 已弃用:生成的最大 token 数(建议改用 `max_completion_tokens`)。
max_completion_tokens: Optional[int] = None
# 生成的最大 token 数(推荐替代 `max_tokens`),默认无限制(受模型上下文窗口限制)。
presence_penalty: Optional[float] = None
# 存在惩罚系数,降低新主题(未出现过的话题)的生成概率(`>1.0` 抑制新话题,`<1.0` 鼓励新话题,默认 `None` 禁用)。
stream: Optional[bool] = False
# 是否启用流式输出(逐 token 返回结果),默认 `False`(一次性返回完整结果)。
stream_options: Optional[StreamOptions] = None
# 流式输出的额外配置(如分块大小、超时等),需参考 `StreamOptions` 的具体定义。
temperature: Optional[float] = None
# 温度系数,控制生成随机性(`0.0` 确定性生成,`>1.0` 更随机,默认 `None` 使用模型默认值)。
top_p: Optional[float] = None
# 核采样nucleus sampling阈值只保留概率累计超过 `top_p` 的 token默认 `None` 禁用)。
response_format: Optional[AnyResponseFormat] = None
# 指定输出格式(如 JSON、XML 等),需传入预定义的格式配置对象。
user: Optional[str] = None
# 用户标识符,用于跟踪或区分不同用户的请求(默认 `None` 不传递)。
metadata: Optional[dict] = None
# 附加元数据,用于传递自定义信息(如请求 ID、调试标记等
```
### FastDeploy 增加额外参数
> 注:
使用 curl 命令发送请求时, 可以直接使用以下参数;
使用openai.Client 发送请求时,需要使用将以下参数放入 `extra_body` 参数中, 如:`extra_body={"chat_template_kwargs": {"enable_thinking":True}, "include_stop_str_in_output": True}`
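下面给出一个通过 `extra_body` 传递额外参数的示例(仅为示意,端口、模型名与参数取值需按实际部署调整):
```python
import openai

# 假设服务监听在本机 8188 端口
client = openai.Client(base_url="http://0.0.0.0:8188/v1", api_key="null")
response = client.chat.completions.create(
    model="default",
    messages=[{"role": "user", "content": "介绍一下大语言模型"}],
    stream=False,
    # FastDeploy 增加的额外参数需要放入 extra_body 中
    extra_body={
        "top_k": 20,
        "min_tokens": 10,
        "chat_template_kwargs": {"enable_thinking": False},
        "include_stop_str_in_output": True,
    },
)
print(response.choices[0].message.content)
```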
额外采样参数的支持如下:
```python
top_k: Optional[int] = None
# 限制每一步生成时只考虑概率最高的 K 个 token用于控制随机性默认 None 表示不限制)。
min_p: Optional[float] = None
# 核采样nucleus sampling阈值只保留概率累计超过 min_p 的 token默认 None 表示禁用)。
min_tokens: Optional[int] = None
# 强制生成的最小 token 数,避免过早截断(默认 None 表示不限制)。
include_stop_str_in_output: Optional[bool] = False
# 是否在输出中包含停止符stop string的内容默认 False即遇到停止符时截断输出
bad_words: Optional[List[str]] = None
# 禁止生成的词汇列表(例如敏感词),模型会避免输出这些词(默认 None 表示不限制)。
repetition_penalty: Optional[float] = None
# 重复惩罚系数,降低已生成 token 的重复概率(>1.0 抑制重复,<1.0 鼓励重复,默认 None 表示禁用)。
```
其他参数的支持如下:
```python
chat_template_kwargs: Optional[dict] = None
# 传递给聊天模板chat template的额外参数用于自定义对话格式默认 None
reasoning_max_tokens: Optional[int] = None
# 推理(如 CoT, 思维链)过程中生成的最大 token 数(默认 None 表示使用全局 max_tokens
structural_tag: Optional[str] = None
# 结构化标签,用于标记生成内容的特定结构(如 JSON、XML 等,默认 None
guided_json: Optional[Union[str, dict, BaseModel]] = None
# 引导生成符合 JSON 结构的内容,可以是 JSON 字符串、字典或 Pydantic 模型(默认 None
guided_regex: Optional[str] = None
# 引导生成符合正则表达式规则的内容(默认 None 表示不限制)。
guided_choice: Optional[List[str]] = None
# 引导生成内容从指定的候选列表中选择(默认 None 表示不限制)。
guided_grammar: Optional[str] = None
# 引导生成符合语法规则(如 BNF的内容默认 None 表示不限制)。
return_token_ids: Optional[bool] = None
# 是否返回生成结果的 token ID 而非文本(默认 None 表示返回文本)。
prompt_token_ids: Optional[List[int]] = None
# 直接传入 prompt 的 token ID 列表,跳过文本编码步骤(默认 None 表示使用文本输入)。
max_streaming_response_tokens: Optional[int] = None
# 流式输出时每次返回的最大 token 数(默认 None 表示不限制)。
disable_chat_template: Optional[bool] = False
# 是否禁用聊天模板渲染,直接使用原始输入(默认 False 表示启用模板)。
```
### 返回字段差异
@@ -101,24 +197,202 @@ FastDeploy 增加的返回字段如下:
- `arrival_time`:返回所有 token 的累计耗时
- `reasoning_content`: 思考链的返回结果
- `prompt_token_ids`: 输入序列的 token id 列表
- `completion_token_ids`: 输出序列的 token id 列表
返回参数总览:
```python
ChatCompletionResponse:
id: str
object: str = "chat.completion"
created: int = Field(default_factory=lambda: int(time.time()))
model: str
choices: List[ChatCompletionResponseChoice]
usage: UsageInfo
ChatCompletionResponseChoice:
index: int
message: ChatMessage
logprobs: Optional[LogProbs] = None
finish_reason: Optional[Literal["stop", "length", "tool_calls", "recover_stop"]]
ChatMessage:
role: str
content: str
reasoning_content: Optional[str] = None
prompt_token_ids: Optional[List[int]] = None
completion_token_ids: Optional[List[int]] = None
# 返回流式响应的字段
ChatCompletionStreamResponse:
id: str
object: str = "chat.completion.chunk"
created: int = Field(default_factory=lambda: int(time.time()))
model: str
choices: List[ChatCompletionResponseStreamChoice]
ChatCompletionResponseStreamChoice:
usage: Optional[UsageInfo] = None
ChatCompletionResponseStreamChoice:
index: int
delta: DeltaMessage
finish_reason: Optional[Literal["stop", "length"]] = None
logprobs: Optional[LogProbs] = None
finish_reason: Optional[Literal["stop", "length", "tool_calls"]] = None
arrival_time: Optional[float] = None
DeltaMessage:
role: Optional[str] = None
content: Optional[str] = None
token_ids: Optional[List[int]] = None
prompt_token_ids: Optional[List[int]] = None
completion_token_ids: Optional[List[int]] = None
reasoning_content: Optional[str] = None
```
## Completion API
Completion API 接口主要用于续聊场景, 适用于用户已自定义好上下文输入、并希望模型仅输出续写内容的场景;推理过程不会增加其他 `prompt` 拼接。
### 发送用户请求
使用 curl 命令发送用户请求示例如下:
```bash
curl -X POST "http://0.0.0.0:8188/v1/completions" \
-H "Content-Type: application/json" \
-d '{
"prompt": "以下是一篇关于深圳文心公园的500字游记和赏析"
}'
```
使用 Python 脚本发送用户请求示例如下:
```python
import openai
host = "0.0.0.0"
port = "8170"
client = openai.Client(base_url=f"http://{host}:{port}/v1", api_key="null")
response = client.completions.create(
model="default",
prompt="以下是一篇关于深圳文心公园的500字游记和赏析",
stream=False,
)
print(response.choices[0].text)
```
关于 OpenAI 协议的说明可参考文档 [OpenAI Completion API](https://platform.openai.com/docs/api-reference/completions/create)。
### 兼容OpenAI 参数
```python
model: Optional[str] = "default"
# 指定使用的模型名称或版本,默认值为 `"default"`(可能指向基础模型)。
prompt: Union[List[int], List[List[int]], str, List[str]]
# 输入提示,支持多种格式:
# - `str`: 纯文本提示(如 `"Hello, how are you?"`)。
# - `List[str]`: 多段文本(如 `["User:", "Hello!", "Assistant:", "Hi!"]`)。
# - `List[int]`: 直接传入 token ID 列表(如 `[123, 456]`)。
# - `List[List[int]]`: 多段 token ID 列表(如 `[[123], [456, 789]]`)。
best_of: Optional[int] = None
# 生成 `best_of` 个候选结果,然后返回其中评分最高的一个(需配合 `n=1` 使用)。
frequency_penalty: Optional[float] = None
# 频率惩罚系数,降低重复生成相同 token 的概率(`>1.0` 抑制重复,`<1.0` 鼓励重复)。
logprobs: Optional[int] = None
# 返回每个生成 token 的对数概率log probabilities可指定返回的候选数量。
max_tokens: Optional[int] = None
# 生成的最大 token 数(包括输入和输出),默认无限制(受模型上下文窗口限制)。
presence_penalty: Optional[float] = None
# 存在惩罚系数,降低新主题(未出现过的话题)的生成概率(`>1.0` 抑制新话题,`<1.0` 鼓励新话题)。
```
### FastDeploy 增加额外参数
> 注:
使用 curl 命令发送请求时, 可以直接使用以下参数;
使用openai.Client 发送请求时,需要使用将以下参数放入 `extra_body` 参数中, 如:`extra_body={"chat_template_kwargs": {"enable_thinking":True}, "include_stop_str_in_output": True}`
额外采样参数的支持如下:
```python
top_k: Optional[int] = None
# 限制每一步生成时只考虑概率最高的 K 个 token用于控制随机性默认 None 表示不限制)。
min_p: Optional[float] = None
# 核采样nucleus sampling阈值只保留概率累计超过 min_p 的 token默认 None 表示禁用)。
min_tokens: Optional[int] = None
# 强制生成的最小 token 数,避免过早截断(默认 None 表示不限制)。
include_stop_str_in_output: Optional[bool] = False
# 是否在输出中包含停止符stop string的内容默认 False即遇到停止符时截断输出
bad_words: Optional[List[str]] = None
# 禁止生成的词汇列表(例如敏感词),模型会避免输出这些词(默认 None 表示不限制)。
repetition_penalty: Optional[float] = None
# 重复惩罚系数,降低已生成 token 的重复概率(>1.0 抑制重复,<1.0 鼓励重复,默认 None 表示禁用)。
```
其他参数的支持如下:
```python
guided_json: Optional[Union[str, dict, BaseModel]] = None
# 引导生成符合 JSON 结构的内容,可以是 JSON 字符串、字典或 Pydantic 模型(默认 None
guided_regex: Optional[str] = None
# 引导生成符合正则表达式规则的内容(默认 None 表示不限制)。
guided_choice: Optional[List[str]] = None
# 引导生成内容从指定的候选列表中选择(默认 None 表示不限制)。
guided_grammar: Optional[str] = None
# 引导生成符合语法规则(如 BNF的内容默认 None 表示不限制)。
return_token_ids: Optional[bool] = None
# 是否返回生成结果的 token ID 而非文本(默认 None 表示返回文本)。
prompt_token_ids: Optional[List[int]] = None
# 直接传入 prompt 的 token ID 列表,跳过文本编码步骤(默认 None 表示使用文本输入)。
max_streaming_response_tokens: Optional[int] = None
# 流式输出时每次返回的最大 token 数(默认 None 表示不限制)。
```
### 返回参数总览
```python
CompletionResponse:
id: str
object: str = "text_completion"
created: int = Field(default_factory=lambda: int(time.time()))
model: str
choices: List[CompletionResponseChoice]
usage: UsageInfo
CompletionResponseChoice:
index: int
text: str
prompt_token_ids: Optional[List[int]] = None
completion_token_ids: Optional[List[int]] = None
arrival_time: Optional[float] = None
logprobs: Optional[int] = None
reasoning_content: Optional[str] = None
finish_reason: Optional[Literal["stop", "length", "tool_calls"]]
# 返回流式响应的字段
CompletionStreamResponse
id: str
object: str = "text_completion"
created: int = Field(default_factory=lambda: int(time.time()))
model: str
choices: List[CompletionResponseStreamChoice]
usage: Optional[UsageInfo] = None
CompletionResponseStreamChoice:
index: int
text: str
arrival_time: float = None
prompt_token_ids: Optional[List[int]] = None
completion_token_ids: Optional[List[int]] = None
logprobs: Optional[float] = None
reasoning_content: Optional[str] = None
finish_reason: Optional[Literal["stop", "length", "tool_calls"]] = None
```

View File

@@ -0,0 +1,93 @@
# ERNIE-4.5-0.3B
## 一、环境准备
### 1.1 支持情况
ERNIE-4.5-0.3B 各量化精度,在下列硬件上部署所需要的最小卡数如下:
| | WINT8 | WINT4 | FP8 |
|-----|-----|-----|-----|
|H800 80GB| 1 | 1 | 1 |
|A800 80GB| 1 | 1 | / |
|H20 96GB| 1 | 1 | 1 |
|L20 48GB| 1 | 1 | 1 |
|A30 40GB| 1 | 1 | / |
|A10 24GB| 1 | 1 | / |
**注:**
1. 在启动命令后指定`--tensor-parallel-size 1` 即可修改部署卡数
2. 表格中未列出的硬件,可根据显存大小进行预估是否可以部署
### 1.2 安装fastdeploy
- 安装请参考[Fastdeploy Installation](../get_started/installation/README.md)完成安装。
- 模型下载,请参考[支持模型列表](../supported_models.md)。**请注意使用Fastdeploy部署需要Paddle后缀的模型**
## 二、如何使用
### 2.1 基础:启动服务
通过下列命令启动服务
```bash
python -m fastdeploy.entrypoints.openai.api_server \
--model baidu/ERNIE-4.5-0.3B-Paddle \
--tensor-parallel-size 1 \
--quantization wint4 \
--max-model-len 32768 \
--kv-cache-ratio 0.75 \
--max-num-seqs 128
```
其中:
- `--quantization`: 表示模型采用的量化策略。不同量化策略,模型的性能和精度也会不同。可选值包括:`wint8` / `wint4` / `block_wise_fp8`(需要Hopper架构)。
- `--max-model-len`表示当前部署的服务所支持的最长Token数量。设置得越大模型可支持的上下文长度也越大但相应占用的显存也越多可能影响并发数。
更多的参数含义与默认设置,请参见[FastDeploy参数说明](../parameters.md)。
### 2.2 进阶:如何获取更优性能
#### 2.2.1 评估应用场景,正确设置参数
结合应用场景评估平均输入长度、平均输出长度、最大上下文长度。例如平均输入长度为1000输出长度为30000那么建议设置为 32768
- 根据最大上下文长度,设置`max-model-len`
- **启用服务管理全局 Block**
```
export ENABLE_V1_KVCACHE_SCHEDULER=1
```
#### 2.2.2 Prefix Caching
**原理:** Prefix Caching的核心思想是通过缓存输入序列的中间计算结果KV Cache避免重复计算从而加速具有相同前缀的多个请求的响应速度。具体参考[prefix-cache](../features/prefix_caching.md)
**启用方式:**
在启动参数下增加下列两行,其中`--enable-prefix-caching`表示启用前缀缓存,`--swap-space`表示在GPU缓存的基础上额外开启CPU缓存大小为GB应根据机器实际情况调整。
```
--enable-prefix-caching
--swap-space 50
```
#### 2.2.3 Chunked Prefill
**原理:** 采用分块策略将预填充Prefill阶段请求拆解为小规模子任务与解码Decode请求混合批处理执行。可以更好地平衡计算密集型Prefill和访存密集型Decode操作优化GPU资源利用率减少单次Prefill的计算量和显存占用从而降低显存峰值避免显存不足的问题。 具体请参考[Chunked Prefill](../features/chunked_prefill.md)
**启用方式:** 在启动参数下增加即可
```
--enable-chunked-prefill
```
#### 2.2.4 CUDAGraph
**原理:**
CUDAGraph 是 NVIDIA 提供的一项 GPU 计算加速技术,通过将 CUDA 操作序列捕获capture为图结构graph实现 GPU 任务的高效执行和优化。CUDAGraph 的核心思想是将一系列 GPU 计算和内存操作封装为一个可重复执行的图,从而减少 CPU-GPU 通信开销、降低内核启动延迟,并提升整体计算性能。
**启用方式:**
在启动命令中增加
```
--use-cudagraph
```
注:
1. 通常情况下不需要额外设置其他参数但CUDAGraph会产生一些额外的显存开销在一些显存受限的场景下可能需要调整。详细的参数调整请参考[GraphOptimizationBackend](../parameters.md) 相关配置参数说明
2. 开启CUDAGraph时暂时只支持单卡推理`--tensor-parallel-size 1`
3. 开启CUDAGraph时暂时不支持同时开启`Chunked Prefill``Prefix Caching`
#### 2.2.5 拒绝采样
**原理:**
拒绝采样即从一个易于采样的提议分布proposal distribution中生成样本避免显式排序从而达到提升采样速度的效果对小尺寸的模型有较明显的提升。
**启用方式:**
启动前增加下列环境变量
```
export FD_SAMPLING_CLASS=rejection
```
## 三、常见问题FAQ
如果您在使用过程中遇到问题,可以在[FAQ](./FAQ.md)中查阅。

View File

@@ -0,0 +1,149 @@
# ERNIE-4.5-21B-A3B
## 一、环境准备
### 1.1 支持情况
ERNIE-4.5-21B-A3B 各量化精度,在下列硬件上部署所需要的最小卡数如下:
| | WINT8 | WINT4 | FP8 |
|-----|-----|-----|-----|
|H800 80GB| 1 | 1 | 1 |
|A800 80GB| 1 | 1 | / |
|H20 96GB| 1 | 1 | 1 |
|L20 48GB| 1 | 1 | 1 |
|A30 40GB| 2 | 1 | / |
|A10 24GB| 2 | 1 | / |
**注:**
1. 在启动命令后指定`--tensor-parallel-size 2` 即可修改部署卡数
2. 表格中未列出的硬件,可根据显存大小进行预估是否可以部署
### 1.2 安装fastdeploy
- 安装,请参考[Fastdeploy Installation](../get_started/installation/README.md)完成安装。
- 模型下载,请参考[支持模型列表](../supported_models.md)。**请注意使用Fastdeploy部署需要Paddle后缀的模型**
## 二、如何使用
### 2.1 基础:启动服务
通过下列命令启动服务
```bash
python -m fastdeploy.entrypoints.openai.api_server \
--model baidu/ERNIE-4.5-21B-A3B-Paddle \
--tensor-parallel-size 1 \
--quantization wint4 \
--max-model-len 32768 \
--kv-cache-ratio 0.75 \
--max-num-seqs 128
```
其中:
- `--quantization`: 表示模型采用的量化策略。不同量化策略,模型的性能和精度也会不同。可选值包括:`wint8` / `wint4` / `block_wise_fp8`(需要Hopper架构)。
- `--max-model-len`表示当前部署的服务所支持的最长Token数量。设置得越大模型可支持的上下文长度也越大但相应占用的显存也越多可能影响并发数。
更多的参数含义与默认设置,请参见[FastDeploy参数说明](../parameters.md)。
### 2.2 进阶:如何获取更优性能
#### 2.2.1 评估应用场景,正确设置参数
结合应用场景评估平均输入长度、平均输出长度、最大上下文长度。例如平均输入长度为1000输出长度为30000那么建议设置为 32768
- 根据最大上下文长度,设置`max-model-len`
- **启用服务管理全局 Block**
```
export ENABLE_V1_KVCACHE_SCHEDULER=1
```
#### 2.2.2 Prefix Caching
**原理:** Prefix Caching的核心思想是通过缓存输入序列的中间计算结果KV Cache避免重复计算从而加速具有相同前缀的多个请求的响应速度。具体参考[prefix-cache](../features/prefix_caching.md)
**启用方式:**
在启动参数下增加下列两行,其中`--enable-prefix-caching`表示启用前缀缓存,`--swap-space`表示在GPU缓存的基础上额外开启CPU缓存大小为GB应根据机器实际情况调整。
```
--enable-prefix-caching
--swap-space 50
```
#### 2.2.3 Chunked Prefill
**原理:** 采用分块策略将预填充Prefill阶段请求拆解为小规模子任务与解码Decode请求混合批处理执行。可以更好地平衡计算密集型Prefill和访存密集型Decode操作优化GPU资源利用率减少单次Prefill的计算量和显存占用从而降低显存峰值避免显存不足的问题。 具体请参考[Chunked Prefill](../features/chunked_prefill.md)
**启用方式:** 在启动参数下增加即可
```
--enable-chunked-prefill
```
#### 2.2.4 MTP (Multi-Token Prediction)
**原理:**
通过一次性预测多个Token减少解码步数以显著加快生成速度同时通过一定策略保持生成质量。具体请参考[投机解码](../features/speculative_decoding.md)。
**启用方式:**
在启动参数下增加即可
```
--speculative-config '{"method": "mtp", "num_speculative_tokens": 1, "model": "${path_to_mtp_model}"}'
```
#### 2.2.5 CUDAGraph
**原理:**
CUDAGraph 是 NVIDIA 提供的一项 GPU 计算加速技术,通过将 CUDA 操作序列捕获capture为图结构graph实现 GPU 任务的高效执行和优化。CUDAGraph 的核心思想是将一系列 GPU 计算和内存操作封装为一个可重复执行的图,从而减少 CPU-GPU 通信开销、降低内核启动延迟,并提升整体计算性能。
**启用方式:**
在启动命令中增加
```
--use-cudagraph
```
注:
1. 通常情况下不需要额外设置其他参数但CUDAGraph会产生一些额外的显存开销在一些显存受限的场景下可能需要调整。详细的参数调整请参考[GraphOptimizationBackend](../parameters.md) 相关配置参数说明
2. 开启CUDAGraph时暂时只支持单卡推理`--tensor-parallel-size 1`
3. 开启CUDAGraph时暂时不支持同时开启`Chunked Prefill``Prefix Caching`
#### 2.2.6 拒绝采样
**原理:**
拒绝采样即从一个易于采样的提议分布proposal distribution中生成样本避免显式排序从而达到提升采样速度的效果对小尺寸的模型有较明显的提升。
**启用方式:**
启动前增加下列环境变量
```
export FD_SAMPLING_CLASS=rejection
```
#### 2.2.7 分离式部署
**原理:** 分离式部署的核心思想是将Prefill 和 Decode 分开部署,在一定场景下可以提高硬件利用率,有效提高吞吐,降低整句时延。具体请参考分离式部署
**启用方式:** 以单机8GPU1P1D各4GPU部署为例与默认的混合式部署方式相比 需要`--splitwise-role`指定节点的角色。并通过环境变量`FD_LOG_DIR``CUDA_VISIBLE_DEVICES`将两个节点的GPU 和日志隔离开
```
# prefill
export CUDA_VISIBLE_DEVICES=0,1,2,3
export INFERENCE_MSG_QUEUE_ID=1315
export FLAGS_max_partition_size=2048
export FD_ATTENTION_BACKEND=FLASH_ATTN
export FD_LOG_DIR="prefill_log"
quant_type=block_wise_fp8
export FD_USE_DEEP_GEMM=0
python -m fastdeploy.entrypoints.openai.api_server --model baidu/ERNIE-4.5-21B-A3B-Paddle \
--max-model-len 131072 \
--max-num-seqs 20 \
--num-gpu-blocks-override 40000 \
--quantization ${quant_type} \
--gpu-memory-utilization 0.9 --kv-cache-ratio 0.9 \
--port 7012 --engine-worker-queue-port 7013 --metrics-port 7014 --tensor-parallel-size 4 \
--cache-queue-port 7015 \
--splitwise-role "prefill" \
```
```
# decode
export CUDA_VISIBLE_DEVICES=4,5,6,7
export INFERENCE_MSG_QUEUE_ID=1215
export FLAGS_max_partition_size=2048
export FD_LOG_DIR="decode_log"
quant_type=block_wise_fp8
export FD_USE_DEEP_GEMM=0
python -m fastdeploy.entrypoints.openai.api_server --model baidu/ERNIE-4.5-21B-A3B-Paddle \
--max-model-len 131072 \
--max-num-seqs 20 \
--quantization ${quant_type} \
--gpu-memory-utilization 0.85 --kv-cache-ratio 0.1 \
--port 9012 --engine-worker-queue-port 8013 --metrics-port 8014 --tensor-parallel-size 4 \
--cache-queue-port 8015 \
--innode-prefill-ports 7013 \
--splitwise-role "decode"
```
## 三、常见问题FAQ
如果您在使用过程中遇到问题,可以在[FAQ](./FAQ.md)中查阅。

View File

@@ -0,0 +1,128 @@
# ERNIE-4.5-300B-A47B
## 一、环境准备
### 1.1 支持情况
ERNIE-4.5-300B-A47B各量化精度在下列硬件上部署所需要的最小卡数如下
| | WINT8 | WINT4 | FP8 | WINT2 | W4A8 |
|-----|-----|-----|-----|-----|-----|
|H800 80GB| 8 | 4 | 8 | 2 | 4 |
|A800 80GB| 8 | 4 | / | 2 | 4 |
**注:**
1. 在启动命令后指定`--tensor-parallel-size 4`即可修改部署卡数
2. 由于仅提供4卡量化scaleW4A8模型需部署在4卡
3. 表格中未列出的硬件,可根据显存大小进行预估是否可以部署
### 1.2 安装fastdeploy
- 安装,请参考[Fastdeploy Installation](../get_started/installation/README.md)完成安装。
- 模型下载,请参考[支持模型列表](../supported_models.md)。**请注意使用Fastdeploy部署需要Paddle后缀的模型**
## 二、如何使用
### 2.1 基础:启动服务
通过下列命令启动服务
```bash
python -m fastdeploy.entrypoints.openai.api_server \
--model baidu/ERNIE-4.5-300B-A47B-Paddle \
--tensor-parallel-size 8 \
--quantization wint4 \
--max-model-len 32768 \
--kv-cache-ratio 0.75 \
--max-num-seqs 128
```
其中:
- `--quantization`: 表示模型采用的量化策略。不同量化策略,模型的性能和精度也会不同。可选值包括:`wint8` / `wint4` / `block_wise_fp8`(需要Hopper架构)。
- `--max-model-len`表示当前部署的服务所支持的最长Token数量。设置得越大模型可支持的上下文长度也越大但相应占用的显存也越多可能影响并发数。
更多的参数含义与默认设置,请参见[FastDeploy参数说明](../parameters.md)。
### 2.2 进阶:如何获取更优性能
#### 2.2.1 评估应用场景,正确设置参数
结合应用场景,评估平均输入长度、平均输出长度、最大上下文长度
- 根据最大上下文长度,设置`max-model-len`。例如平均输入长度为1000输出长度为30000那么建议设置为 32768
- **启用服务管理全局 Block**
```
export ENABLE_V1_KVCACHE_SCHEDULER=1
```
#### 2.2.2 Prefix Caching
**原理:** Prefix Caching的核心思想是通过缓存输入序列的中间计算结果KV Cache避免重复计算从而加速具有相同前缀的多个请求的响应速度。具体参考[prefix-cache](../features/prefix_caching.md)
**启用方式:**
在启动参数下增加下列两行,其中`--enable-prefix-caching`表示启用前缀缓存,`--swap-space`表示在GPU缓存的基础上额外开启CPU缓存大小为GB应根据机器实际情况调整。
```
--enable-prefix-caching
--swap-space 50
```
#### 2.2.3 Chunked Prefill
**原理:** 采用分块策略将预填充Prefill阶段请求拆解为小规模子任务与解码Decode请求混合批处理执行。可以更好地平衡计算密集型Prefill和访存密集型Decode操作优化GPU资源利用率减少单次Prefill的计算量和显存占用从而降低显存峰值避免显存不足的问题。 具体请参考[Chunked Prefill](../features/chunked_prefill.md)
**启用方式:** 在启动参数下增加即可
```
--enable-chunked-prefill
```
#### 2.2.4 MTP (Multi-Token Prediction)
**原理:**
通过一次性预测多个Token减少解码步数以显著加快生成速度同时通过一定策略保持生成质量。具体请参考[投机解码](../features/speculative_decoding.md)。
**启用方式:**
在启动参数下增加即可
```
--speculative-config '{"method": "mtp", "num_speculative_tokens": 1, "model": "${path_to_mtp_model}"}'
```
#### 2.2.5 W4A8C8量化
**原理:**
量化可以实现模型的压缩减少显存占用并加快推理计算速度。对模型MOE部分权重使用per-channel对称4比特量化激活使用静态per-tensor对称8比特量化KVCache使用静态per-channel对称8比特量化。以实现更优的推理效果。
**启用方式:**
需要在启动命令中指定对应的模型名称,`baidu/ERNIE-4.5-300B-A47B-W4A8C8-TP4-Paddle`
```
--model baidu/ERNIE-4.5-300B-A47B-W4A8C8-TP4-Paddle
```
#### 2.2.6 拒绝采样
**原理:**
拒绝采样即从一个易于采样的提议分布proposal distribution中生成样本避免显式排序从而达到提升采样速度的效果对小尺寸的模型有较明显的提升。
**启用方式:**
启动前增加下列环境变量
```
export FD_SAMPLING_CLASS=rejection
```
#### 2.2.7 分离式部署
**原理:** 分离式部署的核心思想是将Prefill 和 Decode 分开部署,在一定场景下可以提高硬件利用率,有效提高吞吐,降低整句时延。具体请参考分离式部署
**启用方式:** 以单机8GPU1P1D各4GPU部署为例与默认的混合式部署方式相比 需要`--splitwise-role`指定节点的角色。并通过环境变量`FD_LOG_DIR``CUDA_VISIBLE_DEVICES`将两个节点的GPU 和日志隔离开
```
export FD_LOG_DIR="log_prefill"
export CUDA_VISIBLE_DEVICES=0,1,2,3
python -m fastdeploy.entrypoints.openai.api_server \
--model baidu/ERNIE-4.5-300B-A47B-Paddle \
--port 8180 --metrics-port 8181 \
--engine-worker-queue-port 8182 \
--cache-queue-port 8183 \
--tensor-parallel-size 4 \
--quantization wint4 \
--splitwise-role "prefill"
```
```
export FD_LOG_DIR="log_decode"
export CUDA_VISIBLE_DEVICES=4,5,6,7
# 注意innode-prefill-ports指定为Prefill服务的engine-worker-queue-port
python -m fastdeploy.entrypoints.openai.api_server \
--model baidu/ERNIE-4.5-300B-A47B-Paddle \
--port 8184 --metrics-port 8185 \
--engine-worker-queue-port 8186 \
--cache-queue-port 8187 \
--tensor-parallel-size 4 \
--quantization wint4 \
--innode-prefill-ports 8182 \
--splitwise-role "decode"
```
## 三、常见问题FAQ
如果您在使用过程中遇到问题,可以在[FAQ](./FAQ.md)中查阅。

View File

@@ -0,0 +1,37 @@
# 常见问题FAQ
## 1.显存不足
1. 启动服务时显存不足:
- 核对模型和量化方式对应的部署最小卡数,如果不满足则需要增加部署卡数
- 如果开启了CUDAGraph尝试通过降低 `gpu_memory_utilization`来为CUDAGraph留存更多的显存或通过减少 `max_num_seqs`,设置`cudagraph_capture_sizes`来减少CUDAGraph的显存占用。
2. 服务运行期间显存不足:
- 检查log中是否有类似如下信息如有通常是输出block不足导致需要减小`kv-cache-ratio`
```
need_block_len: 1 free_list_len: 0
step max_id: 2 max_num: 133 encoder block len: 24
recover seq_id: 2 free_list_len: 144 used_list_len: 134
need_block_len: 1 free_list_len: 0
step max_id: 2 max_num: 144 encoder_block_len: 24
```
建议启用服务管理全局 Block功能在启动服务前加入环境变量
```
export ENABLE_V1_KVCACHE_SCHEDULER=1
```
## 2.模型性能差
1. 首先检查输出长度是否符合预期,是否是解码过长导致。
如果场景输出本身较长请检查log中是否有类似如下信息如有通常是输出block不足导致需要减小`kv-cache-ratio`
```
need_block_len: 1 free_list_len: 0
step max_id: 2 max_num: 133 encoder block len: 24
recover seq_id: 2 free_list_len: 144 used_list_len: 134
need_block_len: 1 free_list_len: 0
step max_id: 2 max_num: 144 encoder_block_len: 24
```
同样建议启用服务管理全局 Block功能在启动服务前加入环境变量
```
export ENABLE_V1_KVCACHE_SCHEDULER=1
```
2. 检查自动profile分配的KVCache block是否符合预期如果自动profile中受到显存波动影响可能导致分配偏少可以通过手工设置`num_gpu_blocks_override`参数扩大KVCache block。

View File

@@ -5,12 +5,12 @@
|ERNIE-4.5-300B-A47B|32K|WINT4|4 (推荐)|export XPU_VISIBLE_DEVICES="0,1,2,3" or "4,5,6,7"<br>python -m fastdeploy.entrypoints.openai.api_server \ <br> --model PaddlePaddle/ERNIE-4.5-300B-A47B-Paddle \ <br> --port 8188 \ <br> --tensor-parallel-size 4 \ <br> --max-model-len 32768 \ <br> --max-num-seqs 64 \ <br> --quantization "wint4" \ <br> --gpu-memory-utilization 0.9|>=2.0.0|
|ERNIE-4.5-300B-A47B|32K|WINT4|8|export XPU_VISIBLE_DEVICES="0,1,2,3,4,5,6,7"<br>python -m fastdeploy.entrypoints.openai.api_server \ <br> --model PaddlePaddle/ERNIE-4.5-300B-A47B-Paddle \ <br> --port 8188 \ <br> --tensor-parallel-size 8 \ <br> --max-model-len 32768 \ <br> --max-num-seqs 64 \ <br> --quantization "wint4" \ <br> --gpu-memory-utilization 0.9|>=2.0.0|
|ERNIE-4.5-300B-A47B|128K|WINT4|8 (recommended)|export XPU_VISIBLE_DEVICES="0,1,2,3,4,5,6,7"<br>python -m fastdeploy.entrypoints.openai.api_server \ <br> --model PaddlePaddle/ERNIE-4.5-300B-A47B-Paddle \ <br> --port 8188 \ <br> --tensor-parallel-size 8 \ <br> --max-model-len 131072 \ <br> --max-num-seqs 64 \ <br> --quantization "wint4" \ <br> --gpu-memory-utilization 0.9|>=2.0.0|
|ERNIE-4.5-21B-A3B|32K|BF16|1|export XPU_VISIBLE_DEVICES="0" # specify any single card<br>python -m fastdeploy.entrypoints.openai.api_server \ <br> --model PaddlePaddle/ERNIE-4.5-21B-A3B-Paddle \ <br> --port 8188 \ <br> --tensor-parallel-size 1 \ <br> --max-model-len 32768 \ <br> --max-num-seqs 128 \ <br> --gpu-memory-utilization 0.9|>=2.0.3|
|ERNIE-4.5-21B-A3B|32K|WINT8|1|export XPU_VISIBLE_DEVICES="0" # specify any single card<br>python -m fastdeploy.entrypoints.openai.api_server \ <br> --model PaddlePaddle/ERNIE-4.5-21B-A3B-Paddle \ <br> --port 8188 \ <br> --tensor-parallel-size 1 \ <br> --max-model-len 32768 \ <br> --max-num-seqs 128 \ <br> --quantization "wint8" \ <br> --gpu-memory-utilization 0.9|>=2.0.3|
|ERNIE-4.5-21B-A3B|32K|WINT4|1|export XPU_VISIBLE_DEVICES="0" # specify any single card<br>python -m fastdeploy.entrypoints.openai.api_server \ <br> --model PaddlePaddle/ERNIE-4.5-21B-A3B-Paddle \ <br> --port 8188 \ <br> --tensor-parallel-size 1 \ <br> --max-model-len 32768 \ <br> --max-num-seqs 128 \ <br> --quantization "wint4" \ <br> --gpu-memory-utilization 0.9|>=2.0.3|
|ERNIE-4.5-21B-A3B|128K|BF16|1|export XPU_VISIBLE_DEVICES="0" # specify any single card<br>python -m fastdeploy.entrypoints.openai.api_server \ <br> --model PaddlePaddle/ERNIE-4.5-21B-A3B-Paddle \ <br> --port 8188 \ <br> --tensor-parallel-size 1 \ <br> --max-model-len 131072 \ <br> --max-num-seqs 128 \ <br> --gpu-memory-utilization 0.9|>=2.0.3|
|ERNIE-4.5-21B-A3B|128K|WINT8|1|export XPU_VISIBLE_DEVICES="0" # specify any single card<br>python -m fastdeploy.entrypoints.openai.api_server \ <br> --model PaddlePaddle/ERNIE-4.5-21B-A3B-Paddle \ <br> --port 8188 \ <br> --tensor-parallel-size 1 \ <br> --max-model-len 131072 \ <br> --max-num-seqs 128 \ <br> --quantization "wint8" \ <br> --gpu-memory-utilization 0.9|>=2.0.3|
|ERNIE-4.5-21B-A3B|128K|WINT4|1|export XPU_VISIBLE_DEVICES="0" # specify any single card<br>python -m fastdeploy.entrypoints.openai.api_server \ <br> --model PaddlePaddle/ERNIE-4.5-21B-A3B-Paddle \ <br> --port 8188 \ <br> --tensor-parallel-size 1 \ <br> --max-model-len 131072 \ <br> --max-num-seqs 128 \ <br> --quantization "wint4" \ <br> --gpu-memory-utilization 0.9|>=2.0.3|
|ERNIE-4.5-21B-A3B|32K|BF16|1|export XPU_VISIBLE_DEVICES="0" # specify any single card<br>python -m fastdeploy.entrypoints.openai.api_server \ <br> --model PaddlePaddle/ERNIE-4.5-21B-A3B-Paddle \ <br> --port 8188 \ <br> --tensor-parallel-size 1 \ <br> --max-model-len 32768 \ <br> --max-num-seqs 128 \ <br> --gpu-memory-utilization 0.9|>=2.1.0|
|ERNIE-4.5-21B-A3B|32K|WINT8|1|export XPU_VISIBLE_DEVICES="0" # specify any single card<br>python -m fastdeploy.entrypoints.openai.api_server \ <br> --model PaddlePaddle/ERNIE-4.5-21B-A3B-Paddle \ <br> --port 8188 \ <br> --tensor-parallel-size 1 \ <br> --max-model-len 32768 \ <br> --max-num-seqs 128 \ <br> --quantization "wint8" \ <br> --gpu-memory-utilization 0.9|>=2.1.0|
|ERNIE-4.5-21B-A3B|32K|WINT4|1|export XPU_VISIBLE_DEVICES="0" # specify any single card<br>python -m fastdeploy.entrypoints.openai.api_server \ <br> --model PaddlePaddle/ERNIE-4.5-21B-A3B-Paddle \ <br> --port 8188 \ <br> --tensor-parallel-size 1 \ <br> --max-model-len 32768 \ <br> --max-num-seqs 128 \ <br> --quantization "wint4" \ <br> --gpu-memory-utilization 0.9|>=2.1.0|
|ERNIE-4.5-21B-A3B|128K|BF16|1|export XPU_VISIBLE_DEVICES="0" # specify any single card<br>python -m fastdeploy.entrypoints.openai.api_server \ <br> --model PaddlePaddle/ERNIE-4.5-21B-A3B-Paddle \ <br> --port 8188 \ <br> --tensor-parallel-size 1 \ <br> --max-model-len 131072 \ <br> --max-num-seqs 128 \ <br> --gpu-memory-utilization 0.9|>=2.1.0|
|ERNIE-4.5-21B-A3B|128K|WINT8|1|export XPU_VISIBLE_DEVICES="0" # specify any single card<br>python -m fastdeploy.entrypoints.openai.api_server \ <br> --model PaddlePaddle/ERNIE-4.5-21B-A3B-Paddle \ <br> --port 8188 \ <br> --tensor-parallel-size 1 \ <br> --max-model-len 131072 \ <br> --max-num-seqs 128 \ <br> --quantization "wint8" \ <br> --gpu-memory-utilization 0.9|>=2.1.0|
|ERNIE-4.5-21B-A3B|128K|WINT4|1|export XPU_VISIBLE_DEVICES="0" # specify any single card<br>python -m fastdeploy.entrypoints.openai.api_server \ <br> --model PaddlePaddle/ERNIE-4.5-21B-A3B-Paddle \ <br> --port 8188 \ <br> --tensor-parallel-size 1 \ <br> --max-model-len 131072 \ <br> --max-num-seqs 128 \ <br> --quantization "wint4" \ <br> --gpu-memory-utilization 0.9|>=2.1.0|
|ERNIE-4.5-0.3B|32K|BF16|1|export XPU_VISIBLE_DEVICES="0" # specify any single card<br>python -m fastdeploy.entrypoints.openai.api_server \ <br> --model PaddlePaddle/ERNIE-4.5-0.3B-Paddle \ <br> --port 8188 \ <br> --tensor-parallel-size 1 \ <br> --max-model-len 32768 \ <br> --max-num-seqs 128 \ <br> --gpu-memory-utilization 0.9|>=2.0.3|
|ERNIE-4.5-0.3B|32K|WINT8|1|export XPU_VISIBLE_DEVICES="x" # specify any single card<br>python -m fastdeploy.entrypoints.openai.api_server \ <br> --model PaddlePaddle/ERNIE-4.5-0.3B-Paddle \ <br> --port 8188 \ <br> --tensor-parallel-size 1 \ <br> --max-model-len 32768 \ <br> --max-num-seqs 128 \ <br> --quantization "wint8" \ <br> --gpu-memory-utilization 0.9|>=2.0.3|
|ERNIE-4.5-0.3B|128K|BF16|1|export XPU_VISIBLE_DEVICES="0" # specify any single card<br>python -m fastdeploy.entrypoints.openai.api_server \ <br> --model PaddlePaddle/ERNIE-4.5-0.3B-Paddle \ <br> --port 8188 \ <br> --tensor-parallel-size 1 \ <br> --max-model-len 131072 \ <br> --max-num-seqs 128 \ <br> --gpu-memory-utilization 0.9|>=2.0.3|
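Once a server from the table above is running, it can be exercised with any OpenAI-compatible client. A minimal smoke-test sketch in Python, assuming the default port 8188 from the commands above and the standard /v1/chat/completions route:

import requests

payload = {
    "model": "PaddlePaddle/ERNIE-4.5-21B-A3B-Paddle",  # typically the same value passed to --model
    "messages": [{"role": "user", "content": "Hello!"}],
    "max_tokens": 64,
}
resp = requests.post("http://localhost:8188/v1/chat/completions", json=payload, timeout=60)
resp.raise_for_status()
print(resp.json()["choices"][0]["message"]["content"])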

View File

@@ -24,8 +24,13 @@ os.environ["GLOG_minloglevel"] = "2"
os.environ["AISTUDIO_LOG"] = "critical"
from fastdeploy.engine.sampling_params import SamplingParams
from fastdeploy.entrypoints.llm import LLM
from fastdeploy.utils import version, envs
from paddleformers.utils.log import logger as pf_logger
if envs.FD_DEBUG != "1":
import logging
pf_logger.logger.setLevel(logging.INFO)
__all__ = ["LLM", "SamplingParams"]
__all__ = ["LLM", "SamplingParams", "version"]
try:
import use_triton_in_paddle

View File

@@ -142,6 +142,7 @@ class CacheMessager:
self.gpu_id = gpu_id
self.cache_info = dict()
self.dp_rank_id = self.rank + local_data_parallel_id * self.nranks
layerwise_send_cache_thread = threading.Thread(target=self._prefill_layerwise_send_cache_thread)
layerwise_send_cache_thread.daemon = True
@@ -159,14 +160,14 @@ class CacheMessager:
prefilled_layer_idx_data = np.zeros(shape=[1], dtype=np.int32)
try:
step_shm_value = IPCSignal(
name=f"splitwise_complete_prefilled_step_{self.rank}",
name=f"splitwise_complete_prefilled_step_{self.dp_rank_id}",
array=prefilled_step_idx_data,
dtype=np.int32,
suffix=self.gpu_id,
create=True,
)
layer_shm_value = IPCSignal(
name=f"splitwise_complete_prefilled_layer_{self.rank}",
name=f"splitwise_complete_prefilled_layer_{self.dp_rank_id}",
array=prefilled_layer_idx_data,
dtype=np.int32,
suffix=self.gpu_id,
@@ -174,14 +175,14 @@ class CacheMessager:
)
except:
step_shm_value = IPCSignal(
name=f"splitwise_complete_prefilled_step_{self.rank}",
name=f"splitwise_complete_prefilled_step_{self.dp_rank_id}",
array=prefilled_step_idx_data,
dtype=np.int32,
suffix=self.gpu_id,
create=False,
)
layer_shm_value = IPCSignal(
name=f"splitwise_complete_prefilled_layer_{self.rank}",
name=f"splitwise_complete_prefilled_layer_{self.dp_rank_id}",
array=prefilled_layer_idx_data,
dtype=np.int32,
suffix=self.gpu_id,

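The hunk above keys the splitwise IPC signals by dp_rank_id (rank + local_data_parallel_id * nranks) instead of the raw tensor-parallel rank. A small sketch restating the naming from the diff shows why two data-parallel replicas with the same TP rank no longer collide:

def splitwise_signal_name(kind: str, rank: int, local_data_parallel_id: int, nranks: int) -> str:
    # dp_rank_id as computed in CacheMessager.__init__ above
    dp_rank_id = rank + local_data_parallel_id * nranks
    return f"splitwise_complete_prefilled_{kind}_{dp_rank_id}"

assert splitwise_signal_name("step", rank=0, local_data_parallel_id=0, nranks=4) == "splitwise_complete_prefilled_step_0"
assert splitwise_signal_name("step", rank=0, local_data_parallel_id=1, nranks=4) == "splitwise_complete_prefilled_step_4"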
View File

@@ -24,7 +24,7 @@ import numpy as np
import paddle
from fastdeploy.cache_manager.cache_data import CacheStatus
from fastdeploy.engine.config import SpeculativeConfig
from fastdeploy.config import SpeculativeConfig
from fastdeploy.inter_communicator import EngineCacheQueue, IPCSignal
from fastdeploy.model_executor.ops.gpu import (
cuda_host_alloc,
@@ -114,7 +114,7 @@ class CacheTransferManager:
self.cpu_cache_kvs = {}
self.gpu_cache_k_tensors = []
self.gpu_cache_v_tensors = []
self.speculative_config = SpeculativeConfig(**args.speculative_config)
self.speculative_config = SpeculativeConfig(args.speculative_config)
self.num_extra_layers = self.speculative_config.num_extra_cache_layer
self.num_extra_layer_gpu_blocks = int(args.num_gpu_blocks * self.speculative_config.num_gpu_block_expand_ratio)

View File

@@ -64,7 +64,10 @@ class PrefixCacheManager:
self.speculative_config = config.speculative_config
self.local_data_parallel_id = local_data_parallel_id
self.num_gpu_blocks = self.cache_config.prefill_kvcache_block_num
if envs.ENABLE_V1_KVCACHE_SCHEDULER:
self.num_gpu_blocks = self.cache_config.total_block_num
else:
self.num_gpu_blocks = self.cache_config.prefill_kvcache_block_num
self.num_cpu_blocks = self.cache_config.num_cpu_blocks
self.gpu_free_block_list = list(range(self.num_gpu_blocks - 1, -1, -1))
if self.num_cpu_blocks > 0:
@@ -93,6 +96,7 @@ class PrefixCacheManager:
self.req_leaf_map = {} # {request_id: leaf node}
self.leaf_req_map = defaultdict(set)
self.unfilled_req_block_map = defaultdict(list)
self.cache_info = {}
self.executor_pool = ThreadPoolExecutor(max_workers=1)
self.free_gpu_executor_pool = ThreadPoolExecutor(max_workers=1)
@@ -168,7 +172,7 @@ class PrefixCacheManager:
+ f" --device_id {int(device_ids[i])}"
+ f" --rank {i}"
+ f" --splitwise_role {self.splitwise_role}"
+ f" --num_layers {cache_config.model_cfg.num_layers}"
+ f" --num_layers {cache_config.model_cfg.num_hidden_layers}"
+ f" --head_dim {cache_config.model_cfg.head_dim}"
+ f" --kv_num_head {kv_num_head}"
+ f" --mp_num {tensor_parallel_size}"
@@ -425,6 +429,135 @@ class PrefixCacheManager:
return gpu_recv_block_ids, gpu_extra_block_ids
def get_required_block_num(self, input_token_num, block_size):
"""
get required block num by input token num and block size
"""
return (input_token_num + block_size - 1) // block_size
def update_cache_blocks(self, task, block_size):
"""
update cache blocks for a task.
# TODO(chengyanfu): support async update
Parameters:
- task: Task
- block_size: Size per block (in tokens)
"""
try:
req_id = task.request_id
num_cached_tokens = task.num_cached_tokens
block_tables = task.block_tables
last_node, input_ids = self.cache_info[req_id]
left_input_ids = input_ids[num_cached_tokens:]
gpu_extra_block_ids = block_tables[num_cached_tokens // block_size :]
with self.request_release_lock:
current_time = time.time()
leaf_node = self.build_path(
req_id=req_id,
current_time=current_time,
input_ids=input_ids,
left_input_ids=left_input_ids,
gpu_block_ids=gpu_extra_block_ids,
block_size=block_size,
last_node=last_node,
reverved_dec_block_num=0,
)
self.req_leaf_map[req_id] = leaf_node
self.leaf_req_map[leaf_node].add(req_id)
self.cache_info[req_id] = (leaf_node, input_ids)
except Exception as e:
logger.error(f"update_cache_blocks, error: {type(e)} {e}")
raise e
def request_match_blocks(self, task, block_size, *args):
"""
get match blocks info for a task.
This is a synchronous interface. If CPU-to-GPU data transfer occurs,
it will block until synchronization completes.
Callers requiring asynchronous behavior should invoke this via a thread pool.
Note: This function may allocate GPU blocks for matched CPU Cache
Parameters:
- task: Task dictionary
- block_size: Size per block (in tokens)
Returns:
- common_block_ids: List of matched shared blocks
- unique_block_ids: List of exclusively allocated blocks
"""
with self.request_release_lock:
try:
hit_info = {}
hit_info["gpu_cache_blocks"] = 0
hit_info["cpu_cache_blocks"] = 0
self.metrics.req_count += 1
input_ids = task.prompt_token_ids
req_id = task.request_id
logger.info(f"request_block_ids: start to allocate blocks for req_id {req_id}")
input_token_num = len(input_ids)
common_block_ids = []
# 1. match block
(
match_gpu_block_ids,
match_cpu_block_ids,
swap_node_ids,
match_block_node,
gpu_match_token_num,
cpu_match_token_num,
) = self.match_block(req_id, input_ids, block_size)
# update matched node info
self._update_matched_node_info(req_id, match_block_node, current_time=time.time())
# 2. prepare cache
# allocate gpu cache for matched cpu blocks
gpu_recv_block_ids = []
match_cpu_blocks_num = len(match_cpu_block_ids)
if self.can_allocate_gpu_blocks(num_blocks=match_cpu_blocks_num):
if match_cpu_blocks_num > 0:
gpu_recv_block_ids = self.allocate_gpu_blocks(match_cpu_blocks_num)
if len(gpu_recv_block_ids) > 0:
self._prepare_cpu_cache(
req_id=req_id,
swap_node_ids=swap_node_ids,
gpu_recv_block_ids=gpu_recv_block_ids,
match_cpu_block_ids=match_cpu_block_ids,
cpu_recv_block_ids=[],
)
else:
raise Exception("Not enough GPU memory to allocate cache for matched CPU Cache")
# record request cache info
self.cache_info[req_id] = (match_block_node, input_ids)
# 3. update metrics
matched_token_num = gpu_match_token_num + cpu_match_token_num
common_block_ids = match_gpu_block_ids + gpu_recv_block_ids
if matched_token_num > 0:
self.metrics.hit_req_count += 1
self.metrics.calculate_hit_metrics(
req_id,
cpu_match_token_num,
gpu_match_token_num,
input_token_num,
)
hit_info["gpu_cache_blocks"] = gpu_match_token_num // block_size
hit_info["cpu_cache_blocks"] = cpu_match_token_num // block_size
self.metrics._update_history_hit_metrics()
if self.metrics.req_count % 10000 == 0:
self.metrics.reset_metrics()
logger.info(
f"request_block_ids: request block for req_id {req_id}: common_block_ids {common_block_ids}"
)
return common_block_ids, matched_token_num, hit_info
except Exception as e:
logger.error(f"request_block_ids: error: {type(e)} {e}")
raise e
def request_block_ids(self, task, block_size, dec_token_num, *args):
"""
Allocate blocks for a task.
@@ -463,12 +596,10 @@ class PrefixCacheManager:
cpu_match_token_num,
) = self.match_block(req_id, input_ids, block_size)
match_gpu_blocks_num = len(match_gpu_block_ids)
match_cpu_blocks_num = len(match_cpu_block_ids)
matched_block_num = match_gpu_blocks_num + match_cpu_blocks_num
matched_token_num_in_cpu_and_gpu = gpu_match_token_num + cpu_match_token_num
# check enough gpu memory to allocate cache
block_num = (input_token_num + block_size - 1 + dec_token_num) // block_size
self._check_validity(req_id, matched_block_num, block_num)
self._check_validity(req_id, match_gpu_blocks_num, block_num)
# update matched node info
current_time = time.time()
self._update_matched_node_info(req_id, match_block_node, current_time)
@@ -557,6 +688,9 @@ class PrefixCacheManager:
node.decrement_shared_count()
node = node.parent
if req_id in self.cache_info:
del self.cache_info[req_id]
logger.info(f"release_block_ids: req_id {req_id} leaf_node {leaf_node}")
if leaf_node == self.radix_tree_root:

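The new get_required_block_num helper above is plain ceiling division of the prompt length over the block size. A standalone restatement with a few sanity checks (block_size 64 matches the default used elsewhere in this changeset):

def get_required_block_num(input_token_num: int, block_size: int) -> int:
    # identical to the helper added to PrefixCacheManager above
    return (input_token_num + block_size - 1) // block_size

assert get_required_block_num(0, 64) == 0
assert get_required_block_num(1, 64) == 1
assert get_required_block_num(64, 64) == 1
assert get_required_block_num(65, 64) == 2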
View File

@@ -336,7 +336,7 @@ QpStatus modify_qp_to_rts(
return QpStatus::kSuccess;
}
static QpInfo* client_exch_dest(
static std::shared_ptr<QpInfo> client_exch_dest(
struct RdmaContext *ctx,
const std::string &dst_ip,
int port,
@@ -403,12 +403,10 @@ static QpInfo* client_exch_dest(
return nullptr;
}
QpInfo* rem_dest = new QpInfo();
if (!rem_dest) {
WARN("Failed to allocate memory for remote destination");
close(sockfd);
return nullptr;
}
// No need to check the allocation result: if allocation fails the process is
// already out of memory, so let it crash and then investigate whether the
// code logic has a memory leak.
auto rem_dest = std::make_shared<QpInfo>();
rem_dest->deserialize(buffer);
return rem_dest;
}
@@ -634,21 +632,20 @@ bool client_exchange_destinations(
}
// Exchange destination info with remote
struct QpInfo* temp_rem_dest = client_exch_dest(ctx, dst_ip, port, &my_dest);
if (!temp_rem_dest) {
auto rem_dest = client_exch_dest(ctx, dst_ip, port, &my_dest);
if (!rem_dest) {
ERR("Failed to exchange destination info with %s:%u", dst_ip.c_str(), port);
return false;
}
struct QpInfo rem_dest = *temp_rem_dest;
LOGD("Remote address - LID: 0x%04x, QPN: 0x%06x, PSN: 0x%06x, Mtu: %u",rem_dest.lid, rem_dest.qpn, rem_dest.psn, temp_rem_dest->mtu);
LOGD("Remote address - LID: 0x%04x, QPN: 0x%06x, PSN: 0x%06x, Mtu: %u",
rem_dest->lid, rem_dest->qpn, rem_dest->psn, rem_dest->mtu);
// Modify QP to RTS state
if (modify_qp_to_rts(ctx, ib_port, my_dest.psn, &rem_dest, gidx) != QpStatus::kSuccess) {
if (modify_qp_to_rts(ctx, ib_port, my_dest.psn, rem_dest.get(), gidx) != QpStatus::kSuccess) {
ERR("Failed to modify QP 0x%x to RTS state", ctx->qp->qp_num);
return false;
}
delete temp_rem_dest;
LOGD("Successfully established connection to %s:%u", dst_ip.c_str(), port);

View File

@@ -45,7 +45,7 @@ class RDMACommManager:
return
self.messager = rdma_comm.RDMACommunicator(
splitwise_role,
rank,
gpu_id,
str(rdma_port) if splitwise_role == "decode" else "0",
cache_k_ptr_list,
cache_v_ptr_list,

View File

@@ -16,27 +16,42 @@
from __future__ import annotations
import json
import os
from dataclasses import dataclass, field
from enum import Enum
from typing import Literal, Optional
from typing import Literal, Optional, Union
from paddleformers.transformers.configuration_utils import PretrainedConfig
import fastdeploy
from fastdeploy import envs
from fastdeploy.model_executor.layers.quantization.quant_base import QuantConfigBase
from fastdeploy.utils import get_logger
from fastdeploy.utils import check_unified_ckpt, get_logger
logger = get_logger("config", "config.log")
TaskOption = Literal["generate"]
class MoEPhase(Enum):
class MoEPhase:
"""
The generation phase of the moe.
"""
PREFILL = 1
DECODER = 2
def __init__(self, phase="prefill"):
self._phase = phase
@property
def phase(self):
return self._phase
@phase.setter
def phase(self, value):
if value not in ["prefill", "decode"]:
raise ValueError(f"The moe_phase is invalid, only support prefill and decode, but got {value}")
else:
self._phase = value
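For reference, a short usage sketch of the reworked MoEPhase: it is now a small mutable object whose setter only accepts the strings "prefill" and "decode" (the old DECODER enum member is gone), assuming it is imported from fastdeploy.config as defined here:

from fastdeploy.config import MoEPhase

phase = MoEPhase("prefill")
phase.phase = "decode"       # accepted
try:
    phase.phase = "decoder"  # rejected: only "prefill" and "decode" are valid
except ValueError as err:
    print(err)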
class ErnieArchitectures:
@@ -60,7 +75,13 @@ class ErnieArchitectures:
PRETRAINED_INIT_CONFIGURATION = {
"top_p": 1.0,
"temperature": 1.0,
"rope_theta": 10000.0,
"penalty_score": 1.0,
"frequency_score": 0.0,
"presence_score": 0.0,
"min_length": 1,
"num_key_value_heads": -1,
"start_layer_index": 0,
"moe_num_shared_experts": 0,
@@ -88,19 +109,7 @@ class ModelConfig:
self,
args,
):
self.max_stop_seqs_num = 5
self.stop_seqs_max_len = 8
# NOTE(gongshaotain): form _load_model_init_val()
self.top_p = 1.0
self.temperature = 1.0
self.rope_theta = 10000.0
self.penalty_score = 1.0
self.frequency_score = 0.0
self.presence_score = 0.0
self.min_length = 1
self.model_name_or_path = ""
self.model = ""
self.is_quantized = False
self.max_model_len = 0
self.dtype = ""
@@ -108,13 +117,13 @@ class ModelConfig:
self.enable_mm = False
self.enable_redundant_experts = False
self.redundant_experts_num = 0
self.quantization = None
for key, value in args.items():
if hasattr(self, key):
setattr(self, key, value)
assert self.model_name_or_path != ""
pretrained_config, _ = PretrainedConfig.get_config_dict(self.model_name_or_path)
assert self.model != ""
pretrained_config, _ = PretrainedConfig.get_config_dict(self.model)
self.pretrained_config = PretrainedConfig.from_dict(pretrained_config)
# set attribute from pretrained_config
@@ -136,6 +145,64 @@ class ModelConfig:
if ErnieArchitectures.contains_ernie_arch(self.architectures):
self.ori_vocab_size = args.get("ori_vocab_size", self.ori_vocab_size)
self.is_unified_ckpt = check_unified_ckpt(self.model)
self.override_name_from_config()
self.read_from_env()
def override_name_from_config(self):
"""
Override attribute names from the exported model's configuration.
"""
if not self.is_unified_ckpt and hasattr(self, "infer_model_mp_num"):
self.tensor_parallel_size = self.infer_model_mp_num
del self.infer_model_mp_num
if hasattr(self, "num_hidden_layers"):
if hasattr(self, "remove_tail_layer"):
if self.remove_tail_layer is True:
self.num_hidden_layers -= 1
elif isinstance(self.remove_tail_layer, int):
self.num_hidden_layers -= self.remove_tail_layer
if not hasattr(self, "mla_use_absorb"):
self.mla_use_absorb = False
def read_from_env(self):
"""
Read configuration information from environment variables and update the object's attributes.
If an attribute is not present or is an empty string in the environment variables, use the default value.
"""
self.max_stop_seqs_num = int(envs.FD_MAX_STOP_SEQS_NUM)
self.stop_seqs_max_len = int(envs.FD_STOP_SEQS_MAX_LEN)
def reset_config_value(key, value):
if not hasattr(self, key.lower()):
if os.getenv(key, None):
value = eval(os.getenv(key))
logger.info(f"Get parameter `{key}` = {value} from environment.")
else:
logger.info(f"Parameter `{key}` will use default value {value}.")
setattr(self, key.lower(), value)
reset_config_value("COMPRESSION_RATIO", 1.0)
reset_config_value("ROPE_THETA", 10000)
def _get_download_model(self, model_name, model_type="default"):
# TODO: Provide dynamic graph for self-downloading and save to the specified download directory.
pass
def print(self):
"""
Print all configuration information.
"""
logger.info("Model Configuration Information :")
for k, v in self.__dict__.items():
logger.info("{:<20}:{:<6}{}".format(k, "", v))
logger.info("=============================================================")
class ParallelConfig:
"""Configuration for the distributed execution."""
@@ -146,13 +213,16 @@ class ParallelConfig:
):
self.sequence_parallel = False # Whether to enable sequence parallelism.
self.use_ep = False # Whether to enable Expert Parallelism
self.moe_phase = MoEPhase.PREFILL # Generation phase
self.moe_phase = MoEPhase("prefill") # Generation phase
self.msg_queue_id = 1 # message queue id
self.tensor_parallel_rank = 0 # TP rank ID
self.tensor_parallel_size = 1 # TP degree
self.expert_parallel_rank = 0 # EP rank ID
self.expert_parallel_size = 1 # EP degree
self.data_parallel_size = 1 # DP degree
self.enable_expert_parallel = False
self.local_data_parallel_id = 0
# The embedding weight distributed on your gpu cards is divided by row or column.
# Defaults to False means divide by row. When vocab_size can not be divided by world_size
# but hidden_size can, we can consider split embedding weight by column.
@@ -160,7 +230,6 @@ class ParallelConfig:
From old version worker args
TODO(gongshaotian): Reclassify
"""
self.model_name_or_path: str = "./output"
self.max_num_seqs: int = 34
# Set default block num for profile run
self.total_block_num: int = 2000
@@ -176,12 +245,8 @@ class ParallelConfig:
self.dtype: str = "bfloat16"
# Encoder's decoder num
self.enc_dec_block_num: int = 1
# KV cache ratio for input
self.kv_cache_ratio: float = 0.7
# First token id
self.first_token_id: int = 1
# Gpu memory utilization
self.gpu_memory_utilization: float = 0.9
# Process ID of engine
self.engine_pid: Optional[int] = None
# Do profile or not
@@ -190,12 +255,8 @@ class ParallelConfig:
self.pad_token_id: int = -1
#
self.eos_tokens_lens: int = 2
# Enable chunked prefill
self.enable_chunked_prefill: bool = False
self.max_num_batched_tokens: int = 2048
# enable prefix cache
self.enable_prefix_caching = None
# splitwise role
self.splitwise_role: str = "mixed"
# guided decoding backend
@@ -208,13 +269,16 @@ class ParallelConfig:
for key, value in args.items():
if hasattr(self, key):
setattr(self, key, value)
self.use_ep = args["expert_parallel_size"] > 1
# currently, the expert parallel size is equal to the data parallel size
self.expert_parallel_size = self.data_parallel_size
self.use_ep = self.expert_parallel_size > 1
if self.splitwise_role == "mixed":
self.moe_phase = MoEPhase.PREFILL
self.moe_phase = MoEPhase(phase="prefill")
elif self.splitwise_role == "prefill":
self.moe_phase = MoEPhase.PREFILL
self.moe_phase = MoEPhase(phase="prefill")
elif self.splitwise_role == "decode":
self.moe_phase = MoEPhase.DECODER
self.moe_phase = MoEPhase(phase="decode")
else:
raise NotImplementedError
@@ -228,6 +292,16 @@ class ParallelConfig:
else:
self.pd_disaggregation_mode = "None"
def print(self):
"""
print all config
"""
logger.info("Parallel Configuration Information :")
for k, v in self.__dict__.items():
logger.info("{:<20}:{:<6}{}".format(k, "", v))
logger.info("=============================================================")
class SpeculativeConfig:
"""
@@ -249,7 +323,7 @@ class SpeculativeConfig:
# ngram match
self.max_ngram_size: int = 5
# model for mtp/eagle/draft_model
self.model_name_or_path: Optional[str] = None
self.model: Optional[str] = None
# quantization of model
self.quantization: Optional[str] = None
# allocate more blocks to prevent mtp from finishing the block earlier than the main model
@@ -267,21 +341,75 @@ class SpeculativeConfig:
# This ensures that the specified simulation acceptance rate is not affected.
self.benchmark_mode: bool = False
# TODO(YuanRisheng): The name of the server args is different from the name of the SpeculativeConfig.
# We temporarily add the name map here and will delete it in the future.
name_map = {
"speculative_method": "method",
"speculative_max_draft_token_num": "num_speculative_tokens",
"speculative_model_name_or_path": "model_name_or_path",
"speculative_model_quantization": "quantization",
"speculative_benchmark_mode": "benchmark_mode",
}
self.num_extra_cache_layer = 0
for key, value in args.items():
if key in name_map.keys() and hasattr(self, name_map[key]):
if key == "speculative_benchmark_mode":
value = True if value.lower() == "true" else False
setattr(self, name_map[key], value)
if hasattr(self, key):
setattr(self, key, value)
self.read_model_config()
self.reset()
def read_model_config(self):
"""
Read configuration from file.
"""
self.model_config = {}
if not self.enabled_speculative_decoding():
return
self.is_unified_ckpt = check_unified_ckpt(self.model)
if self.model is None:
return
self.config_path = os.path.join(self.model, "config.json")
if os.path.exists(self.config_path):
self.model_config = json.load(open(self.config_path, "r", encoding="utf-8"))
def reset(self):
"""
Reset configuration.
"""
def reset_value(cls, value_name, key=None, default=None):
if key is not None and key in cls.model_config:
setattr(cls, value_name, cls.model_config[key])
elif getattr(cls, value_name, None) is None:
setattr(cls, value_name, default)
if not self.enabled_speculative_decoding():
return
# NOTE(liuzichang): We will support multi-layer in future
if self.method in ["mtp"]:
self.num_extra_cache_layer = 1
def enabled_speculative_decoding(self):
"""
Check if speculative decoding is enabled.
"""
if self.method is None:
return False
return True
def to_json_string(self):
"""
Convert speculative_config to json string.
"""
return json.dumps({key: value for key, value in self.__dict__.items() if value is not None})
def print(self):
"""
print all config
"""
logger.info("Speculative Decoding Configuration Information :")
for k, v in self.__dict__.items():
logger.info("{:<20}:{:<6}{}".format(k, "", v))
logger.info("=============================================================")
def __str__(self) -> str:
return self.to_json_string()
class DeviceConfig:
@@ -299,60 +427,69 @@ class DeviceConfig:
setattr(self, key, value)
@dataclass
class GraphOptimizationConfig:
"""
Configuration for compute graph level optimization.
"""
"""The Top-level graph optimization contral corresponds to different backends.
- 0: dyncmic graph
- 1: static graph
- 2: static graph + cinn compilation backend
"""
graph_opt_level: int = 0
def __init__(
self,
args,
):
"""The Top-level graph optimization contral corresponds to different backends.
- 0: dyncmic graph
- 1: static graph
- 2: static graph + cinn compilation backend
"""
self.graph_opt_level: int = 0
# CUDA Graph Config
""" Whether to use cudagraph.
- False: cudagraph is not used.
- True: cudagraph is used.
It requires that all input buffers have fixed addresses, and all
splitting ops write their outputs to input buffers.
- With dynamic graph backend: ...
- With static graph backend: WIP
"""
sot_warmup_sizes: Optional[list[int]] = field(default_factory=list)
""" Number of warmup runs for SOT warmup. """
use_cudagraph: bool = False
"""Sizes to capture cudagraph.
- None (default): capture sizes are inferred from llm config.
- list[int]: capture sizes are specified as given."""
cudagraph_capture_sizes: Optional[list[int]] = None
""" Number of warmup runs for cudagraph. """
cudagraph_num_of_warmups: int = 2
"""Whether to copy input tensors for cudagraph.
If the caller can guarantee that the same input buffers
are always used, it can set this to False. Otherwise, it should
set this to True."""
cudagraph_copy_inputs: bool = False
""" In static graph, this is an operation list that does not need to be captured by the CUDA graph.
CudaGraphBackend will split these operations from the static graph.
Example usage:
cudagraph_splitting_ops = ["paddle.unified_attention"]
# CUDA Graph Config
""" Whether to use cudagraph.
- False: cudagraph is not used.
- True: cudagraph is used.
It requires that all input buffers have fixed addresses, and all
splitting ops write their outputs to input buffers.
- With dynamic graph backend: ...
- With static graph backend: WIP
"""
self.sot_warmup_sizes: list[int] = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 16, 32, 64, 128]
""" Number of warmup runs for SOT warmup. """
self.use_cudagraph: bool = False
"""Sizes to capture cudagraph.
- None (default): capture sizes are inferred from llm config.
- list[int]: capture sizes are specified as given."""
self.cudagraph_capture_sizes: Optional[list[int]] = None
""" Number of warmup runs for cudagraph. """
self.cudagraph_num_of_warmups: int = 2
"""Whether to copy input tensors for cudagraph.
If the caller can guarantee that the same input buffers
are always used, it can set this to False. Otherwise, it should
set this to True."""
self.cudagraph_copy_inputs: bool = False
""" In static graph, this is an operation list that does not need to be captured by the CUDA graph.
CudaGraphBackend will split these operations from the static graph.
Example usage:
cudagraph_splitting_ops = ["paddle.unified_attention"]
Note: If want to use subgraph capture functionality in a dynamic graph,
can manually split the model into multiple layers and apply the @support_graph_optimization decorator
only to the layer where CUDA graph functionality is required.
"""
cudagraph_splitting_ops: list[str] = field(default_factory=list)
""" Whether to use a full cuda graph for the entire forward pass rather than
splitting certain operations such as attention into subgraphs.
Thus this flag cannot be used together with splitting_ops."""
full_cuda_graph: bool = True
Note: If want to use subgraph capture functionality in a dynamic graph,
can manually split the model into multiple layers and apply the @support_graph_optimization decorator
only to the layer where CUDA graph functionality is required.
"""
self.cudagraph_splitting_ops: list[str] = []
""" Whether to use a full cuda graph for the entire forward pass rather than
splitting certain operations such as attention into subgraphs.
Thus this flag cannot be used together with splitting_ops."""
self.full_cuda_graph: bool = True
max_capture_size: int = field(default=None, init=False) # type: ignore
batch_size_to_captured_size: dict[int, int] = field(default=None, init=False) # type: ignore
# CINN Config ...
self.max_capture_size: int = None
self.batch_size_to_captured_size: dict[int, int] = None
# CINN Config ...
if args is not None:
for key, value in args.items():
if hasattr(self, key):
setattr(self, key, value)
self.check_legality_parameters()
def init_with_cudagrpah_size(self, max_num_seqs: int = 0) -> None:
"""
@@ -399,6 +536,130 @@ class GraphOptimizationConfig:
draft_capture_sizes.append(max_num_seqs)
self.cudagraph_capture_sizes = sorted(draft_capture_sizes)
def to_json_string(self):
"""
Convert graph_optimization_config to json string.
"""
return json.dumps({key: value for key, value in self.__dict__.items()})
def __str__(self) -> str:
return self.to_json_string()
def check_legality_parameters(
self,
) -> None:
"""Check the legality of parameters passed in from the command line"""
if self.graph_opt_level is not None:
assert self.graph_opt_level in [
0,
1,
2,
], "In graph optimization config, graph_opt_level can only take the values of 0, 1 and 2."
if self.use_cudagraph is not None:
assert (
type(self.use_cudagraph) is bool
), "In graph optimization config, type of use_cudagraph must is bool."
if self.cudagraph_capture_sizes is not None:
assert (
type(self.cudagraph_capture_sizes) is list
), "In graph optimization config, type of cudagraph_capture_sizes must is list."
assert (
len(self.cudagraph_capture_sizes) > 0
), "In graph optimization config, When opening the CUDA graph, it is forbidden to set the capture sizes to an empty list."
def update_use_cudagraph(self, argument: bool):
"""
Reconcile the two ways a user can specify the use_cudagraph parameter:
'--use-cudagraph' and '--graph-optimization-config'
"""
if self.use_cudagraph is None:
# User only set '--use-cudagraph'
self.use_cudagraph = argument
else:
# User both set '--use-cudagraph' and '--graph-optimization-config'
if self.use_cudagraph is False and argument is True:
raise ValueError(
"Invalid parameter: Cannot set --use-cudagraph and --graph-optimization-config '{\"use_cudagraph\":false}' simultaneously."
)
argument = self.use_cudagraph
class EarlyStopConfig:
def __init__(
self,
args,
):
"""
Early Stop Configuration class.
Attributes:
window_size: size of the window
threshold: trigger early stop when the ratio of probs exceeds the threshold
"""
"""enable to use early stop"""
self.enable_early_stop: bool = False
"""strategy for early stop, the strategy lists are ['repetition']"""
self.strategy: str = "repetition"
""" the maximum length of verify window for early stop """
self.window_size: int = 3000
""" the probs threshold for early stop """
self.threshold: float = 0.99
if args is not None:
for key, value in args.items():
if hasattr(self, key):
setattr(self, key, value)
self.check_legality_parameters()
def to_json_string(self):
"""
Convert early_stop_config to json string.
"""
return json.dumps({key: value for key, value in self.__dict__.items()})
def __str__(self) -> str:
return self.to_json_string()
def check_legality_parameters(
self,
) -> None:
"""Check the legality of parameters passed in from the command line"""
if self.enable_early_stop is not None:
assert isinstance(
self.enable_early_stop, bool
), "In early stop config, type of enable_early_stop must is bool."
if self.window_size is not None:
assert isinstance(self.window_size, int), "In early stop config, type of window_size must be int."
assert self.window_size > 0, "window_size must be larger than 0"
if self.threshold is not None:
assert isinstance(self.threshold, float), "In early stop config, type of threshold must be float."
assert self.threshold >= 0 and self.threshold <= 1, "threshold must be between 0 and 1"
def update_enable_early_stop(self, argument: bool):
"""
Reconcile the two ways a user can specify the enable_early_stop parameter:
'--enable-early-stop' and '--early-stop-config'
"""
if self.enable_early_stop is None:
# User only set '--enable-early-stop'
self.enable_early_stop = argument
else:
# User both set '--enable-early-stop' and '--early-stop-config'
if self.enable_early_stop is False and argument is True:
raise ValueError(
"Invalid parameter: Cannot set ---enable-early-stop and --early-stop-config '{\"enable_early_stop\":false}' simultaneously."
)
argument = self.enable_early_stop
class LoadChoices(str, Enum):
"""LoadChoices"""
DEFAULT = "default"
# only support qwen3-bf16 now
NEW_LOADER = "new_loader"
class LoadConfig:
"""
@@ -416,6 +677,7 @@ class LoadConfig:
self,
args,
):
self.load_choices: Union[str, LoadChoices] = LoadChoices.DEFAULT.value
self.use_fastsafetensor = int(envs.FD_USE_FASTSAFETENSOR) == 1
self.dynamic_load_weight: bool = False
self.load_strategy: Optional[Literal["ipc", "ipc_snapshot"]] = None
@@ -430,10 +692,162 @@ class LoRAConfig:
pass
class KVCacheConfig:
"""KV Cache Config"""
cache_quant_dtype: str = "none"
class CacheConfig:
"""
Configuration for the KV cache.
Attributes:
block_size (int): Size of a cache block in number of tokens.
gpu_memory_utilization (float): Fraction of GPU memory to use for model execution.
cache_dtype (str): Data type for kv cache storage. Default is 'bfloat16'.
num_gpu_blocks_override (Optional[int]): Number of GPU blocks to use.
Overrides profiled num_gpu_blocks if provided.
kv_cache_ratio (float): Ratio for calculating the maximum block number.
enc_dec_block_num (int): Number of encoder-decoder blocks.
prealloc_dec_block_slot_num_threshold (int): Token-slot threshold for allocating the next blocks during decoding.
enable_prefix_caching (bool): Flag to enable prefix caching.
"""
def __init__(self, args):
"""
Initialize the CacheConfig class.
Args:
block_size (int): Size of a cache block in number of tokens.
gpu_memory_utilization (float): Fraction of GPU memory to use.
cache_dtype (str): Data type for cache storage. Default is 'bfloat16'.
num_gpu_blocks_override (Optional[int]): Override for number of GPU blocks.
num_cpu_blocks (Optional[int]): Number of CPU blocks.
kv_cache_ratio (float): Ratio for max block calculation.
enc_dec_block_num (int): Number of encoder-decoder blocks.
prealloc_dec_block_slot_num_threshold (int): Token-slot threshold for allocating the next blocks during decoding; used when ENABLE_V1_KVCACHE_SCHEDULER=1.
enable_prefix_caching (bool): Enable prefix caching.
"""
self.block_size = 64
self.gpu_memory_utilization = 0.9
self.num_gpu_blocks_override = None
if envs.ENABLE_V1_KVCACHE_SCHEDULER:
self.kv_cache_ratio = 1.0
else:
self.kv_cache_ratio = 0.75
self.enc_dec_block_num = 2
self.prealloc_dec_block_slot_num_threshold = 5
self.cache_dtype = "bfloat16"
self.model_cfg = None
self.enable_chunked_prefill = False
self.rdma_comm_ports = None
self.cache_transfer_protocol = None
self.pd_comm_port = None
self.enable_prefix_caching = False
self.enable_ssd_cache = False
self.cache_queue_port = None
self.swap_space = None
for key, value in args.items():
if hasattr(self, key):
setattr(self, key, value)
if self.rdma_comm_ports is not None and isinstance(self.rdma_comm_ports, str):
self.rdma_comm_ports = self.rdma_comm_ports.split(",")
if self.pd_comm_port is not None and isinstance(self.pd_comm_port, str):
self.pd_comm_port = [int(port) for port in self.pd_comm_port.split(",")]
if self.swap_space is None:
self.enable_hierarchical_cache = False
else:
self.enable_hierarchical_cache = True
if self.model_cfg is not None:
if self.model_cfg.quantization_config is not None:
self.cache_dtype = self.model_cfg.quantization_config.get("kv_cache_quant_type", self.cache_dtype)
if (
hasattr(self.model_cfg, "num_key_value_heads")
and hasattr(self.model_cfg, "num_key_value_heads")
and self.model_cfg.num_key_value_heads is not None
and int(self.model_cfg.num_key_value_heads) > 0
):
kv_num_head = int(self.model_cfg.num_key_value_heads)
else:
kv_num_head = self.model_cfg.num_attention_heads
self.model_cfg.kv_num_head = kv_num_head
# TODO check name
if "int4" in self.cache_dtype.lower() or "float4" in self.cache_dtype.lower():
byte_size = 0.5
self.cache_dtype = "uint8"
elif "int8" in self.cache_dtype.lower() or "float8" in self.cache_dtype.lower():
self.cache_dtype = "uint8"
byte_size = 1
else:
byte_size = 2
self.each_token_cache_space = int(
self.model_cfg.num_hidden_layers * kv_num_head * self.model_cfg.head_dim * byte_size
)
self.bytes_per_block = int(self.each_token_cache_space * self.block_size)
self.bytes_per_layer_per_block = int(
self.block_size
* self.model_cfg.kv_num_head
* self.model_cfg.head_dim
// args["tensor_parallel_size"]
* byte_size
)
if self.swap_space is None:
self.num_cpu_blocks = 0
else:
self.num_cpu_blocks = int(self.swap_space * 1024**3 / self.bytes_per_block)
self._verify_args()
def metrics_info(self):
"""Convert cache_config to dict(key: str, value: str) for prometheus metrics info."""
return {key: str(value) for key, value in self.__dict__.items()}
def _verify_args(self):
if self.gpu_memory_utilization > 1.0:
raise ValueError("GPU memory utilization must be less than 1.0. Got " f"{self.gpu_memory_utilization}.")
if self.kv_cache_ratio > 1.0:
raise ValueError("KV cache ratio must be less than 1.0. Got " f"{self.kv_cache_ratio}.")
def postprocess(self, num_total_tokens, number_of_tasks):
"""
calculate block num
"""
self.dec_token_num = self.enc_dec_block_num * self.block_size
if self.num_gpu_blocks_override is not None:
self.total_block_num = self.num_gpu_blocks_override
if envs.ENABLE_V1_KVCACHE_SCHEDULER:
self.prefill_kvcache_block_num = self.total_block_num
else:
self.prefill_kvcache_block_num = int(self.total_block_num * self.kv_cache_ratio)
else:
length = num_total_tokens // number_of_tasks
block_num = (length + self.block_size - 1 + self.dec_token_num) // self.block_size
self.total_block_num = block_num * number_of_tasks
self.prefill_kvcache_block_num = self.total_block_num
logger.info(f"Doing profile, the total_block_num:{self.total_block_num}")
def reset(self, num_gpu_blocks):
"""
reset gpu block number
"""
self.total_block_num = num_gpu_blocks
if envs.ENABLE_V1_KVCACHE_SCHEDULER:
self.prefill_kvcache_block_num = self.total_block_num
else:
self.prefill_kvcache_block_num = int(self.total_block_num * self.kv_cache_ratio)
logger.info(
f"Reset block num, the total_block_num:{self.total_block_num},"
f" prefill_kvcache_block_num:{self.prefill_kvcache_block_num}"
)
def print(self):
"""
print all config
"""
logger.info("Cache Configuration Information :")
for k, v in self.__dict__.items():
logger.info("{:<20}:{:<6}{}".format(k, "", v))
logger.info("=============================================================")
class DecodingConfig:
@@ -451,6 +865,63 @@ class DecodingConfig:
setattr(self, key, value)
class CommitConfig:
"""
Configuration for tracking version information from version.txt
Attributes:
fastdeploy_commit: Full FastDeploy git commit hash
paddle_version: PaddlePaddle version string
paddle_commit: PaddlePaddle git commit hash
cuda_version: CUDA version string
compiler_version: CXX compiler version string
"""
def __init__(
self,
):
self.fastdeploy_commit: str = ""
self.paddle_version: str = ""
self.paddle_commit: str = ""
self.cuda_version: str = ""
self.compiler_version: str = ""
self._load_from_version_file()
def _load_from_version_file(self, file_path: str = None):
"""Internal method to load version info from file"""
if file_path is None:
file_path = os.path.join(fastdeploy.__path__[0], "version.txt")
try:
with open(file_path, "r") as f:
for line in f:
line = line.strip()
if line.startswith("fastdeploy GIT COMMIT ID:"):
self.fastdeploy_commit = line.split(":")[1].strip()
elif line.startswith("Paddle version:"):
self.paddle_version = line.split(":")[1].strip()
elif line.startswith("Paddle GIT COMMIT ID:"):
self.paddle_commit = line.split(":")[1].strip()
elif line.startswith("CUDA version:"):
self.cuda_version = line.split(":")[1].strip()
elif line.startswith("CXX compiler version:"):
self.compiler_version = line.split(":")[1].strip()
except FileNotFoundError:
logger.info(f"Warning: Version file not found at {file_path}")
except Exception as e:
logger.info(f"Warning: Could not read version file - {e!s}")
def print(self):
"""
print all config
"""
logger.info("Fasedeploy Commit Information :")
for k, v in self.__dict__.items():
logger.info("{:<20}:{:<6}{}".format(k, "", v))
logger.info("=============================================================")
@dataclass
class FDConfig:
"""
@@ -466,8 +937,9 @@ class FDConfig:
load_config: LoadConfig = field(default=None, init=True)
quant_config: Optional[QuantConfigBase] = None
graph_opt_config: Optional[GraphOptimizationConfig] = None
early_stop_config: Optional[EarlyStopConfig] = None
decoding_config: DecodingConfig = field(default=None, init=True) # type: ignore
kv_cache_config: KVCacheConfig = field(default=None, init=True) # type: ignore
cache_config: CacheConfig = field(default=None, init=True) # type: ignore
def __post_init__(self):
# Initialize cuda graph capture list

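The classes in this file now all take a plain args dict and ignore unknown keys. A minimal sketch of the new construction pattern, assuming fastdeploy.config is importable as shown in the engine-args diff below:

from fastdeploy.config import EarlyStopConfig, GraphOptimizationConfig

early_stop_cfg = EarlyStopConfig({"enable_early_stop": True, "window_size": 1000})
early_stop_cfg.update_enable_early_stop(True)   # reconciles --enable-early-stop with the dict value
print(early_stop_cfg)                           # __str__ serializes the config to JSON

graph_opt_cfg = GraphOptimizationConfig({"use_cudagraph": True, "cudagraph_capture_sizes": [1, 2, 4, 8]})
graph_opt_cfg.update_use_cudagraph(True)        # reconciles --use-cudagraph with the dict value
print(graph_opt_cfg)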
View File

@@ -15,19 +15,24 @@
"""
import json
import os
from dataclasses import asdict, dataclass
from dataclasses import fields as dataclass_fields
from typing import Any, Dict, List, Optional
from fastdeploy.engine.config import (
import paddle
from fastdeploy.config import (
CacheConfig,
Config,
EarlyStopConfig,
GraphOptimizationConfig,
LoadConfig,
ModelConfig,
ParallelConfig,
SpeculativeConfig,
TaskOption,
)
from fastdeploy.engine.config import Config
from fastdeploy.scheduler.config import SchedulerConfig
from fastdeploy.utils import FlexibleArgumentParser
@@ -314,6 +319,23 @@ class EngineArgs:
Must be explicitly enabled via the `--enable-logprob` startup parameter to output logprob values.
"""
enable_early_stop: bool = False
"""
Flag to enable early stop. Default is False (disabled).
"""
early_stop_config: Optional[Dict[str, Any]] = None
"""
Configuration for early stop.
"""
load_choices: str = "default"
"""The format of the model weights to load.
Options include:
- "default": default loader.
- "new_loader": new loader.
"""
def __post_init__(self):
"""
Post-initialization processing to set default tokenizer if not provided.
@@ -465,6 +487,18 @@ class EngineArgs:
default=EngineArgs.enable_logprob,
help="Enable output of token-level log probabilities.",
)
model_group.add_argument(
"--enable-early-stop",
action="store_true",
default=EngineArgs.enable_early_stop,
help="Enable early stopping during generation.",
)
model_group.add_argument(
"--early-stop-config",
type=json.loads,
default=EngineArgs.early_stop_config,
help="the config for early stop.",
)
# Parallel processing parameters group
parallel_group = parser.add_argument_group("Parallel Configuration")
@@ -519,6 +553,16 @@ class EngineArgs:
help="Enable expert parallelism.",
)
# Load group
load_group = parser.add_argument_group("Load Configuration")
load_group.add_argument(
"--load_choices",
type=str,
default=EngineArgs.load_choices,
help="The format of the model weights to load.\
default/new_loader.",
)
# CacheConfig parameters group
cache_group = parser.add_argument_group("Cache Configuration")
@@ -758,46 +802,14 @@ class EngineArgs:
"""
return cls(**{field.name: getattr(args, field.name) for field in dataclass_fields(cls)})
def create_model_config(self) -> ModelConfig:
"""
Create and return a ModelConfig object based on the current settings.
"""
return ModelConfig(
model_name_or_path=self.model,
config_json_file=self.model_config_name,
quantization=self.quantization,
dynamic_load_weight=self.dynamic_load_weight,
load_strategy=self.load_strategy,
)
def create_cache_config(self, model_cfg) -> CacheConfig:
"""
Create and return a CacheConfig object based on the current settings.
"""
return CacheConfig(
block_size=self.block_size,
tensor_parallel_size=self.tensor_parallel_size,
gpu_memory_utilization=self.gpu_memory_utilization,
num_gpu_blocks_override=self.num_gpu_blocks_override,
kv_cache_ratio=self.kv_cache_ratio,
prealloc_dec_block_slot_num_threshold=self.prealloc_dec_block_slot_num_threshold,
enable_prefix_caching=self.enable_prefix_caching,
swap_space=self.swap_space,
cache_queue_port=self.cache_queue_port,
model_cfg=model_cfg,
enable_chunked_prefill=self.enable_chunked_prefill,
enc_dec_block_num=self.static_decode_blocks,
rdma_comm_ports=self.rdma_comm_ports,
cache_transfer_protocol=self.cache_transfer_protocol,
pd_comm_port=self.pd_comm_port,
)
def create_speculative_config(self) -> SpeculativeConfig:
""" """
speculative_args = asdict(self)
if self.speculative_config is not None:
return SpeculativeConfig(**self.speculative_config)
else:
return SpeculativeConfig()
for k, v in self.speculative_config.items():
speculative_args[k] = v
return SpeculativeConfig(speculative_args)
def create_scheduler_config(self) -> SchedulerConfig:
"""
@@ -823,43 +835,60 @@ class EngineArgs:
return SchedulerConfig(**params)
def create_parallel_config(self) -> ParallelConfig:
"""
Create and return a ParallelConfig object based on the current settings.
"""
return ParallelConfig(
tensor_parallel_size=self.tensor_parallel_size,
enable_expert_parallel=self.enable_expert_parallel,
data_parallel_size=self.data_parallel_size,
enable_custom_all_reduce=self.enable_custom_all_reduce,
)
def create_graph_optimization_config(self) -> GraphOptimizationConfig:
"""
Create and return a GraphOptimizationConfig object based on the current settings.
"""
graph_optimization_args = asdict(self)
if self.graph_optimization_config is not None:
return GraphOptimizationConfig(**self.graph_optimization_config)
else:
return GraphOptimizationConfig()
for k, v in self.graph_optimization_config.items():
graph_optimization_args[k] = v
return GraphOptimizationConfig(graph_optimization_args)
def create_early_stop_config(self) -> EarlyStopConfig:
"""
Create and return an EarlyStopConfig object based on the current settings.
"""
early_stop_args = asdict(self)
if self.early_stop_config is not None:
for k, v in self.early_stop_config.items():
early_stop_args[k] = v
return EarlyStopConfig(early_stop_args)
def create_engine_config(self) -> Config:
"""
Create and return a Config object based on the current settings.
"""
model_cfg = self.create_model_config()
all_dict = asdict(self)
model_cfg = ModelConfig(all_dict)
if not model_cfg.is_unified_ckpt and hasattr(model_cfg, "tensor_parallel_size"):
self.tensor_parallel_size = model_cfg.tensor_parallel_size
if self.max_num_batched_tokens is None:
if self.enable_chunked_prefill:
self.max_num_batched_tokens = 2048
else:
self.max_num_batched_tokens = self.max_model_len
if not int(os.getenv("ENABLE_V1_KVCACHE_SCHEDULER", "0")):
self.max_num_batched_tokens = self.max_model_len
else:
if paddle.is_compiled_with_xpu():
self.max_num_batched_tokens = self.max_model_len
else:
self.max_num_batched_tokens = 8192
all_dict = asdict(self)
all_dict["model_cfg"] = model_cfg
cache_cfg = CacheConfig(all_dict)
load_cfg = LoadConfig(all_dict)
parallel_cfg = ParallelConfig(all_dict)
scheduler_cfg = self.create_scheduler_config()
speculative_cfg = self.create_speculative_config()
graph_opt_cfg = self.create_graph_optimization_config()
graph_opt_cfg.update_use_cudagraph(self.use_cudagraph)
early_stop_cfg = self.create_early_stop_config()
early_stop_cfg.update_enable_early_stop(self.enable_early_stop)
assert not (
self.tensor_parallel_size <= 1 and self.enable_custom_all_reduce
), "enable_custom_all_reduce must be used with tensor_parallel_size>1"
@@ -869,8 +898,9 @@ class EngineArgs:
model_config=model_cfg,
scheduler_config=scheduler_cfg,
tokenizer=self.tokenizer,
cache_config=self.create_cache_config(model_cfg),
parallel_config=self.create_parallel_config(),
cache_config=cache_cfg,
load_config=load_cfg,
parallel_config=parallel_cfg,
max_model_len=self.max_model_len,
tensor_parallel_size=self.tensor_parallel_size,
max_num_seqs=self.max_num_seqs,
@@ -892,4 +922,6 @@ class EngineArgs:
guided_decoding_backend=self.guided_decoding_backend,
disable_any_whitespace=self.guided_decoding_disable_any_whitespace,
enable_logprob=self.enable_logprob,
early_stop_config=early_stop_cfg,
load_choices=self.load_choices,
)

View File

@@ -15,592 +15,19 @@
import json
import os
from dataclasses import dataclass
from datetime import datetime
from typing import Any, Dict, List, Literal, Optional
from typing import Any, Dict, List, Optional
from fastdeploy import envs
from fastdeploy.config import (
CacheConfig,
CommitConfig,
LoadConfig,
ModelConfig,
ParallelConfig,
)
from fastdeploy.platforms import current_platform
from fastdeploy.scheduler import SchedulerConfig
from fastdeploy.utils import (
ceil_div,
check_unified_ckpt,
get_host_ip,
is_port_available,
llm_logger,
)
TaskOption = Literal["generate"]
class ModelConfig:
"""
Configuration class for the model.
Attributes:
model_dir (str): Directory path to the model.
is_unified_ckpt (bool): Flag indicating if the checkpoint is unified.
model_name_or_path (str): Name or path of the model.
"""
def __init__(
self,
model_name_or_path: str,
config_json_file: str = "config.json",
dynamic_load_weight: bool = False,
load_strategy: str = "ipc_snapshot",
quantization: str = None,
download_dir: Optional[str] = None,
):
"""
Initialize the ModelConfig class.
Args:
model_name_or_path (str): Name or path of the model.
config_json_file (str): Path to the configuration JSON file. Default is 'config.json'.
download_dir (Optional[str]): Directory to download model files. Default is None.
"""
self.model_dir = model_name_or_path
self.is_unified_ckpt = check_unified_ckpt(self.model_dir)
self.dynamic_load_weight = dynamic_load_weight
self.load_strategy = load_strategy
self.quantization = quantization
config_file = os.path.join(model_name_or_path, config_json_file)
if os.path.isfile(model_name_or_path):
try:
from paddleformers.transformers import AutoConfig
config = AutoConfig.from_pretrained(model_name_or_path)
config_dict = {k: v for k, v in vars(config).items() if not k.startswith("_")}
for key, value in config_dict.items():
setattr(self, key, value)
except Exception:
llm_logger.error(
"Don't support the current model, you can use `paddleformers` to register your model."
)
raise ValueError(
"Don't support the current model, you can use `paddleformers` to register your model."
)
else:
with open(config_file, "r", encoding="utf-8") as f:
config_dict = json.load(f)
for key, value in config_dict.items():
try:
setattr(self, key, value)
except Exception:
continue
if isinstance(self.architectures, list):
self.architectures = self.architectures[0]
self.model_name_or_path = model_name_or_path
self.override_name_from_config()
self.read_from_env()
def override_name_from_config(self):
"""
Override attribute names from the exported model's configuration.
"""
if not self.is_unified_ckpt and hasattr(self, "infer_model_mp_num"):
self.tensor_parallel_size = self.infer_model_mp_num
del self.infer_model_mp_num
if hasattr(self, "num_hidden_layers"):
if hasattr(self, "remove_tail_layer"):
if self.remove_tail_layer is True:
self.num_hidden_layers -= 1
elif isinstance(self.remove_tail_layer, int):
self.num_hidden_layers -= self.remove_tail_layer
self.num_layers = self.num_hidden_layers
del self.num_hidden_layers
if not hasattr(self, "mla_use_absorb"):
self.mla_use_absorb = False
if not hasattr(self, "head_dim"):
assert hasattr(self, "hidden_size") and hasattr(self, "num_attention_heads")
self.head_dim = self.hidden_size // self.num_attention_heads
def read_from_env(self):
"""
Read configuration information from environment variables and update the object's attributes.
If an attribute is not present or is an empty string in the environment variables, use the default value.
"""
self.max_stop_seqs_num = int(envs.FD_MAX_STOP_SEQS_NUM)
self.stop_seqs_max_len = int(envs.FD_STOP_SEQS_MAX_LEN)
def reset_config_value(key, value):
if not hasattr(self, key.lower()):
if os.getenv(key, None):
value = eval(os.getenv(key))
llm_logger.info(f"Get parameter `{key}` = {value} from environment.")
else:
llm_logger.info(f"Parameter `{key}` will use default value {value}.")
setattr(self, key.lower(), value)
reset_config_value("COMPRESSION_RATIO", 1.0)
reset_config_value("ROPE_THETA", 10000)
def _get_download_model(self, model_name, model_type="default"):
# TODO: Provide dynamic graph for self-downloading and save to the specified download directory.
pass
def print(self):
"""
Print all configuration information.
"""
llm_logger.info("Model Configuration Information :")
for k, v in self.__dict__.items():
llm_logger.info("{:<20}:{:<6}{}".format(k, "", v))
llm_logger.info("=============================================================")
class CacheConfig:
"""
Configuration for the KV cache.
Attributes:
block_size (int): Size of a cache block in number of tokens.
gpu_memory_utilization (float): Fraction of GPU memory to use for model execution.
cache_dtype (str): Data type for kv cache storage. Default is 'bfloat16'.
num_gpu_blocks_override (Optional[int]): Number of GPU blocks to use.
Overrides profiled num_gpu_blocks if provided.
kv_cache_ratio (float): Ratio for calculating the maximum block number.
enc_dec_block_num (int): Number of encoder-decoder blocks.
prealloc_dec_block_slot_num_threshold (int): Number of token slot threadshold to allocate next blocks for decoding.
enable_prefix_caching (bool): Flag to enable prefix caching.
"""
def __init__(
self,
block_size: int,
gpu_memory_utilization: float,
cache_dtype: str = "bfloat16",
num_gpu_blocks_override: Optional[int] = None,
swap_space: Optional[int] = None,
kv_cache_ratio: float = 0.75,
enc_dec_block_num: int = 2,
prealloc_dec_block_slot_num_threshold: int = 5,
tensor_parallel_size: int = 1,
enable_prefix_caching=False,
enable_ssd_cache=False,
model_cfg=None,
cache_queue_port=None,
enable_chunked_prefill=False,
rdma_comm_ports=None,
cache_transfer_protocol=None,
pd_comm_port=None,
):
"""
Initialize the CacheConfig class.
Args:
block_size (int): Size of a cache block in number of tokens.
gpu_memory_utilization (float): Fraction of GPU memory to use.
cache_dtype (str): Data type for cache storage. Default is 'bfloat16'.
num_gpu_blocks_override (Optional[int]): Override for number of GPU blocks.
num_cpu_blocks (Optional[int]): Number of CPU blocks.
kv_cache_ratio (float): Ratio for max block calculation.
enc_dec_block_num (int): Number of encoder-decoder blocks.
prealloc_dec_block_slot_num_threshold (int): Number of token slot threadshold to allocate next blocks for decoding, used when ENABLE_V1_KVCACHE_SCHEDULER=1.
enable_prefix_caching (bool): Enable prefix caching.
"""
self.block_size = block_size
self.gpu_memory_utilization = gpu_memory_utilization
self.num_gpu_blocks_override = num_gpu_blocks_override
self.kv_cache_ratio = kv_cache_ratio
self.enc_dec_block_num = enc_dec_block_num
self.prealloc_dec_block_slot_num_threshold = prealloc_dec_block_slot_num_threshold
self.cache_dtype = cache_dtype
if hasattr(model_cfg, "quantization_config"):
self.cache_dtype = model_cfg.quantization_config.get("kv_cache_quant_type", cache_dtype)
self.enable_chunked_prefill = enable_chunked_prefill
self.rdma_comm_ports = rdma_comm_ports
self.cache_transfer_protocol = cache_transfer_protocol
self.pd_comm_port = pd_comm_port
if rdma_comm_ports is not None and isinstance(rdma_comm_ports, str):
self.rdma_comm_ports = rdma_comm_ports.split(",")
if pd_comm_port is not None and isinstance(pd_comm_port, str):
self.pd_comm_port = [int(port) for port in pd_comm_port.split(",")]
self.enable_prefix_caching = enable_prefix_caching
if swap_space is None:
self.enable_hierarchical_cache = False
else:
self.enable_hierarchical_cache = True
self.enable_ssd_cache = enable_ssd_cache
self.model_cfg = model_cfg
self.cache_queue_port = cache_queue_port
self.swap_space = swap_space
if (
hasattr(self.model_cfg, "num_key_value_heads")
and hasattr(self.model_cfg, "num_key_value_heads")
and self.model_cfg.num_key_value_heads is not None
and int(self.model_cfg.num_key_value_heads) > 0
):
kv_num_head = int(self.model_cfg.num_key_value_heads)
else:
kv_num_head = self.model_cfg.num_attention_heads
self.model_cfg.kv_num_head = kv_num_head
# TODO check name
if "int4" in self.cache_dtype.lower() or "float4" in self.cache_dtype.lower():
byte_size = 0.5
self.cache_dtype = "uint8"
elif "int8" in self.cache_dtype.lower() or "float8" in self.cache_dtype.lower():
self.cache_dtype = "uint8"
byte_size = 1
else:
byte_size = 2
self.each_token_cache_space = int(
self.model_cfg.num_layers * kv_num_head * self.model_cfg.head_dim * byte_size
)
self.bytes_per_block = int(self.each_token_cache_space * self.block_size)
self.bytes_per_layer_per_block = int(
self.block_size * self.model_cfg.kv_num_head * self.model_cfg.head_dim // tensor_parallel_size * byte_size
)
if self.swap_space is None:
self.num_cpu_blocks = 0
else:
self.num_cpu_blocks = int(self.swap_space * 1024**3 / self.bytes_per_block)
self._verify_args()
def metrics_info(self):
"""Convert cache_config to dict(key: str, value: str) for prometheus metrics info."""
return {key: str(value) for key, value in self.__dict__.items()}
def _verify_args(self):
if self.gpu_memory_utilization > 1.0:
raise ValueError("GPU memory utilization must be less than 1.0. Got " f"{self.gpu_memory_utilization}.")
if self.kv_cache_ratio > 1.0:
raise ValueError("KV cache ratio must be less than 1.0. Got " f"{self.kv_cache_ratio}.")
def postprocess(self, num_total_tokens, number_of_tasks):
"""
calculate block num
"""
self.dec_token_num = self.enc_dec_block_num * self.block_size
if self.num_gpu_blocks_override is not None:
self.total_block_num = self.num_gpu_blocks_override
self.prefill_kvcache_block_num = int(self.total_block_num * self.kv_cache_ratio)
else:
length = num_total_tokens // number_of_tasks
block_num = (length + self.block_size - 1 + self.dec_token_num) // self.block_size
self.total_block_num = block_num * number_of_tasks
self.prefill_kvcache_block_num = self.total_block_num
llm_logger.info(f"Doing profile, the total_block_num:{self.total_block_num}")
def reset(self, num_gpu_blocks):
"""
reset gpu block number
"""
self.total_block_num = num_gpu_blocks
self.prefill_kvcache_block_num = int(self.total_block_num * self.kv_cache_ratio)
llm_logger.info(
f"Reset block num, the total_block_num:{self.total_block_num},"
f" prefill_kvcache_block_num:{self.prefill_kvcache_block_num}"
)
def print(self):
"""
print all config
"""
llm_logger.info("Cache Configuration Information :")
for k, v in self.__dict__.items():
llm_logger.info("{:<20}:{:<6}{}".format(k, "", v))
llm_logger.info("=============================================================")
class SpeculativeConfig:
"""
Speculative Decoding Configuration class.
Attributes:
method (Optional[str]): Method used for speculative decoding.
num_speculative_tokens (int): Maximum draft tokens, default is 1.
model_name_or_path (Optional[str]): Path of the model.
quantization (str): Quantization method for draft model, default is WINT8.
max_model_len (Optional[int]): Maximum model length for the draft model.
benchmark_mode (bool): Whether to use benchmark mode.
"""
def __init__(
self,
method: Optional[str] = None,
num_speculative_tokens: Optional[int] = 1,
model: Optional[str] = None,
quantization: Optional[str] = "WINT8",
max_model_len: Optional[int] = None,
benchmark_mode: bool = False,
**kwargs,
):
self.model_name_or_path = model
self.method = method
self.num_speculative_tokens = num_speculative_tokens
self.quantization = quantization
self.max_model_len = max_model_len
self.benchmark_mode = benchmark_mode
# Fixed now
self.num_gpu_block_expand_ratio = 1
self.num_extra_cache_layer = 0
for key, value in kwargs.items():
try:
setattr(self, key, value)
except Exception:
continue
self.read_model_config()
self.reset()
def read_model_config(self):
"""
Read configuration from file.
"""
self.model_config = {}
if not self.enabled_speculative_decoding():
return
self.is_unified_ckpt = check_unified_ckpt(self.model_name_or_path)
if self.model_name_or_path is None:
return
self.config_path = os.path.join(self.model_name_or_path, "config.json")
if os.path.exists(self.config_path):
self.model_config = json.load(open(self.config_path, "r", encoding="utf-8"))
def reset(self):
"""
Reset configuration.
"""
def reset_value(cls, value_name, key=None, default=None):
if key is not None and key in cls.model_config:
setattr(cls, value_name, cls.model_config[key])
elif getattr(cls, value_name, None) is None:
setattr(cls, value_name, default)
if not self.enabled_speculative_decoding():
return
# NOTE(liuzichang): We will support multi-layer in future
if self.method in ["mtp"]:
self.num_extra_cache_layer = 1
def enabled_speculative_decoding(self):
"""
Check if speculative decoding is enabled.
"""
if self.method is None:
return False
return True
def to_json_string(self):
"""
Convert speculative_config to json string.
"""
return json.dumps({key: value for key, value in self.__dict__.items() if value is not None})
def print(self):
"""
print all config
"""
llm_logger.info("Speculative Decoding Configuration Information :")
for k, v in self.__dict__.items():
llm_logger.info("{:<20}:{:<6}{}".format(k, "", v))
llm_logger.info("=============================================================")
def __str__(self) -> str:
return self.to_json_string()
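A possible usage sketch (the draft-model path is hypothetical and is assumed to contain a config.json):

# Hedged usage sketch; "./mtp_draft_model" is a hypothetical local directory with a config.json.
spec_cfg = SpeculativeConfig(method="mtp", num_speculative_tokens=1, model="./mtp_draft_model")
assert spec_cfg.enabled_speculative_decoding()
assert spec_cfg.num_extra_cache_layer == 1  # set by reset() for the "mtp" method
print(spec_cfg.to_json_string())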
class GraphOptimizationConfig:
def __init__(
self,
graph_opt_level: Optional[int] = 0,
use_cudagraph: Optional[bool] = None,
cudagraph_capture_sizes: Optional[List[int]] = None,
sot_warmup_sizes: Optional[List[int]] = None,
**kwargs,
):
"""
Graph Optimization Configuration class.
Attributes:
graph_opt_level: Compute graph optimization level
use_cudagraph: Use CUDA Graph or not
cudagraph_capture_sizes: Batch size list will be captured by CUDA Graph
"""
self.check_legality_parameters(graph_opt_level, use_cudagraph, cudagraph_capture_sizes, **kwargs)
self.graph_opt_level = graph_opt_level
self.use_cudagraph = use_cudagraph
self.cudagraph_capture_sizes = cudagraph_capture_sizes
self.sot_warmup_sizes = [] if sot_warmup_sizes is None else sot_warmup_sizes
def to_json_string(self):
"""
Convert graph_optimization_config to json string.
"""
return json.dumps({key: value for key, value in self.__dict__.items()})
def __str__(self) -> str:
return self.to_json_string()
def check_legality_parameters(
self,
graph_opt_level: Optional[int] = None,
use_cudagraph: Optional[bool] = None,
cudagraph_capture_sizes: Optional[List[int]] = None,
**kwargs,
) -> None:
"""Check the legality of parameters passed in from the command line"""
if graph_opt_level is not None:
assert graph_opt_level in [
0,
1,
2,
], "In graph optimization config, graph_opt_level can only take the values of 0, 1 and 2."
if use_cudagraph is not None:
assert type(use_cudagraph) is bool, "In graph optimization config, type of use_cudagraph must be bool."
if cudagraph_capture_sizes is not None:
assert (
type(cudagraph_capture_sizes) is list
), "In graph optimization config, type of cudagraph_capture_sizes must is list."
assert (
len(cudagraph_capture_sizes) > 0
), "In graph optimization config, When opening the CUDA graph, it is forbidden to set the capture sizes to an empty list."
for key, value in kwargs.items():
raise ValueError(f"Invalid --graph-optimization-config parameter {key}")
def update_use_cudagraph(self, argument: bool):
"""
Unify the use_cudagraph parameter specified by the user through two entry points:
'--use-cudagraph' and '--graph-optimization-config'
"""
if self.use_cudagraph is None:
# User only set '--use-cudagraph'
self.use_cudagraph = argument
else:
# User both set '--use-cudagraph' and '--graph-optimization-config'
if self.use_cudagraph is False and argument is True:
raise ValueError(
"Invalid parameter: Cannot set --use-cudagraph and --graph-optimization-config '{\"use_cudagraph\":false}' simultaneously."
)
argument = self.use_cudagraph
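A short sketch of how the two use_cudagraph entry points are reconciled by update_use_cudagraph:

# Only --use-cudagraph was passed: the flag value is adopted.
cfg = GraphOptimizationConfig(use_cudagraph=None)
cfg.update_use_cudagraph(True)
assert cfg.use_cudagraph is True

# --graph-optimization-config '{"use_cudagraph": false}' together with --use-cudagraph is rejected.
cfg = GraphOptimizationConfig(use_cudagraph=False)
# cfg.update_use_cudagraph(True)  # raises ValueError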
class ParallelConfig:
"""
Configuration for parallelism.
Attributes:
tensor_parallel_size (int): Size of tensor parallelism.
data_parallel_size (int): Size of data parallelism.
local_data_parallel_id (int): ID of local data parallel.
enable_expert_parallel (bool): Whether to enable expert parallel.
"""
def __init__(
self,
tensor_parallel_size: int = 1,
data_parallel_size: int = 1,
enable_expert_parallel: bool = False,
enable_custom_all_reduce: bool = False,
):
"""
Initialize the ParallelConfig class.
Args:
tensor_parallel_size (int): Size of tensor parallelism.
data_parallel_size (int): Size of data parallelism.
local_data_parallel_id (int): ID of local data parallel.
enable_expert_parallel (bool): Whether to enable expert parallel.
"""
self.tensor_parallel_size = tensor_parallel_size
self.data_parallel_size = data_parallel_size
self.enable_expert_parallel = enable_expert_parallel
self.expert_parallel_size = data_parallel_size
self.local_data_parallel_id = 0
self.enable_custom_all_reduce = enable_custom_all_reduce
def print(self):
"""
print all config
"""
llm_logger.info("Parallel Configuration Information :")
for k, v in self.__dict__.items():
llm_logger.info("{:<20}:{:<6}{}".format(k, "", v))
llm_logger.info("=============================================================")
@dataclass
class CommitConfig:
"""
Configuration for tracking version information from version.txt
Attributes:
fastdeploy_commit: Full FastDeploy git commit hash
paddle_version: PaddlePaddle version string
paddle_commit: PaddlePaddle git commit hash
cuda_version: CUDA version string
compiler_version: CXX compiler version string
"""
fastdeploy_commit: str = ""
paddle_version: str = ""
paddle_commit: str = ""
cuda_version: str = ""
compiler_version: str = ""
def __post_init__(self):
"""Automatically load version info when initialized"""
self._load_from_version_file()
def _load_from_version_file(self, file_path: str = "fastdeploy/version.txt"):
"""Internal method to load version info from file"""
try:
with open(file_path, "r") as f:
for line in f:
line = line.strip()
if line.startswith("fastdeploy GIT COMMIT ID:"):
self.fastdeploy_commit = line.split(":")[1].strip()
elif line.startswith("Paddle version:"):
self.paddle_version = line.split(":")[1].strip()
elif line.startswith("Paddle GIT COMMIT ID:"):
self.paddle_commit = line.split(":")[1].strip()
elif line.startswith("CUDA version:"):
self.cuda_version = line.split(":")[1].strip()
elif line.startswith("CXX compiler version:"):
self.compiler_version = line.split(":")[1].strip()
except FileNotFoundError:
llm_logger.info(f"Warning: Version file not found at {file_path}")
except Exception as e:
llm_logger.info(f"Warning: Could not read version file - {e!s}")
def print(self):
"""
print all config
"""
llm_logger.info("Fasedeploy Commit Information :")
for k, v in self.__dict__.items():
llm_logger.info("{:<20}:{:<6}{}".format(k, "", v))
llm_logger.info("=============================================================")
from fastdeploy.utils import ceil_div, get_host_ip, is_port_available, llm_logger
class Config:
@@ -627,6 +54,7 @@ class Config:
splitwise_role (str): Splitwise role.
innode_prefill_ports (Optional[List[int]]): Innode prefill ports.
Temporary configuration, will be removed in the future.
load_choices (str): The format of the model weights to load. Default is 'default'.
"""
def __init__(
@@ -635,6 +63,7 @@ class Config:
cache_config: CacheConfig,
scheduler_config: SchedulerConfig,
parallel_config: ParallelConfig,
load_config: LoadConfig,
commit_config: CommitConfig = CommitConfig(),
model_name_or_path: str = None,
tokenizer: str = None,
@@ -659,6 +88,8 @@ class Config:
guided_decoding_backend: Optional[str] = None,
disable_any_whitespace: bool = False,
enable_logprob: bool = False,
early_stop_config: Optional[Dict[str, Any]] = None,
load_choices: str = "default",
):
"""
Initialize the Config class.
@@ -687,11 +118,15 @@ class Config:
guided_decoding_backend(str): Guided decoding backend. Default is None.
disable_any_whitespace(bool): Disable any whitespace when using guided decoding.
Default is False.
enable_logprob(bool): Enable logprob. Default is False.
early_stop_config (Optional[Dict[str, Any]]): Early stop configuration. Default is None.
load_choices (str): The format of the model weights to load. Default is 'default'.
"""
self.model_config = model_config
self.cache_config = cache_config
self.scheduler_config = scheduler_config
self.parallel_config = parallel_config
self.load_config = load_config
self.commit_config = commit_config
self.model_name_or_path = model_name_or_path
self.tokenizer = tokenizer
@@ -731,9 +166,11 @@ class Config:
self.long_prefill_token_threshold = long_prefill_token_threshold
self.reasoning_parser = reasoning_parser
self.graph_optimization_config = graph_optimization_config
self.early_stop_config = early_stop_config
self.guided_decoding_backend = guided_decoding_backend
self.disable_any_whitespace = disable_any_whitespace
self._str_to_list("innode_prefill_ports", int)
self.load_choices = load_choices
assert self.splitwise_role in ["mixed", "prefill", "decode"]
@@ -788,6 +225,9 @@ class Config:
else:
self.is_master = False
if self.tensor_parallel_size <= self.worker_num_per_node:
self.is_master = True
import paddle
self.paddle_commit_id = paddle.version.commit
@@ -796,7 +236,13 @@ class Config:
if self.cache_config.enable_chunked_prefill:
self.max_num_batched_tokens = 2048
else:
self.max_num_batched_tokens = self.max_model_len
if not int(os.getenv("ENABLE_V1_KVCACHE_SCHEDULER", "0")):
self.max_num_batched_tokens = self.max_model_len
else:
if paddle.is_compiled_with_xpu():
self.max_num_batched_tokens = self.max_model_len
else:
self.max_num_batched_tokens = 8192
if self.long_prefill_token_threshold == 0:
self.long_prefill_token_threshold = int(self.max_model_len * 0.04)
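A standalone sketch of the max_num_batched_tokens defaulting logic above (not part of the Config class):

import os

def default_max_num_batched_tokens(enable_chunked_prefill: bool, max_model_len: int, on_xpu: bool) -> int:
    # Mirrors the branches above: chunked prefill caps the batch at 2048 tokens;
    # otherwise the V1 KV-cache scheduler raises the default to 8192 except on XPU.
    if enable_chunked_prefill:
        return 2048
    if not int(os.getenv("ENABLE_V1_KVCACHE_SCHEDULER", "0")):
        return max_model_len
    return max_model_len if on_xpu else 8192

# Returns max_model_len unless ENABLE_V1_KVCACHE_SCHEDULER=1 is set in the environment.
print(default_max_num_batched_tokens(False, 32768, on_xpu=False))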
@@ -844,10 +290,11 @@ class Config:
)
if not self.cache_config.enable_chunked_prefill:
assert self.max_num_batched_tokens >= self.max_model_len, (
f"max_num_batched_tokens: {self.max_num_batched_tokens} "
f"should be larger than or equal to max_model_len: {self.max_model_len}"
)
if not int(os.getenv("ENABLE_V1_KVCACHE_SCHEDULER", "0")):
assert self.max_num_batched_tokens >= self.max_model_len, (
f"max_num_batched_tokens: {self.max_num_batched_tokens} "
f"should be larger than or equal to max_model_len: {self.max_model_len}"
)
else:
assert self.max_num_batched_tokens >= self.cache_config.block_size, (
f"max_num_batched_tokens: {self.max_num_batched_tokens} "

View File

@@ -243,38 +243,38 @@ class LLMEngine:
self.splitwise_receive_thread.daemon = True
self.splitwise_receive_thread.start()
self.cfg.init_cache_info()
self.cfg.init_cache_info()
role = self.cfg.splitwise_role
host_ip = self.cfg.host_ip
disaggregate = self.cfg.disaggregate_info
if self.cfg.scheduler_config.name == "splitwise":
self.scheduler.start(role, host_ip, disaggregate)
role = self.cfg.splitwise_role
host_ip = self.cfg.host_ip
disaggregate = self.cfg.disaggregate_info
if self.cfg.scheduler_config.name == "splitwise":
self.scheduler.start(role, host_ip, disaggregate)
time.sleep(1)
time.sleep(1)
if self.cfg.parallel_config.enable_expert_parallel and self.cfg.parallel_config.data_parallel_size > 1:
self.dp_processed = []
for i in range(
1,
self.cfg.parallel_config.data_parallel_size // self.cfg.nnode,
):
time.sleep(1)
self.dp_processed.append(
multiprocessing.Process(
target=start_expert_service,
args=(
self.cfg,
i + self.cfg.node_rank * self.cfg.worker_num_per_node,
self.ipc_signal_suffix,
),
)
if self.cfg.parallel_config.enable_expert_parallel and self.cfg.parallel_config.data_parallel_size > 1:
self.dp_processed = []
for i in range(
1,
self.cfg.parallel_config.data_parallel_size // self.cfg.nnode,
):
time.sleep(1)
self.dp_processed.append(
multiprocessing.Process(
target=start_expert_service,
args=(
self.cfg,
i + self.cfg.node_rank * self.cfg.worker_num_per_node,
self.ipc_signal_suffix,
),
)
llm_logger.info(
f"Engine is initialized successfully with {self.cfg.tensor_parallel_size}"
+ f" data parallel id {i}"
)
self.dp_processed[-1].start()
)
llm_logger.info(
f"Engine is initialized successfully with {self.cfg.tensor_parallel_size}"
+ f" data parallel id {i}"
)
self.dp_processed[-1].start()
console_logger.info(f"Worker processes are launched with {time.time() - start_time} seconds.")
return True
@@ -373,6 +373,8 @@ class LLMEngine:
int(self.resource_manager.available_batch()),
self.cfg.max_prefill_batch,
)
self.resource_manager.check_and_free_block_tables()
tasks = self.scheduler.get_requests(
available_blocks=self.resource_manager.available_block_num(),
block_size=self.cfg.cache_config.block_size,
@@ -422,7 +424,7 @@ class LLMEngine:
else:
err, data = self.zmq_server.receive_pyobj_once(block)
if err is not None:
llm_logger.error("Engine stops inserting zmq task into scheduler")
llm_logger.error("Engine stops inserting zmq task into scheduler, err:{err}")
break
request, insert_task = None, []
@@ -491,6 +493,7 @@ class LLMEngine:
request = Request.from_dict(task)
llm_logger.info(f"Receive request {request}")
if sampling_params is not None:
sampling_params.update_from_tokenizer(self.data_processor.tokenizer)
request.sampling_params = sampling_params
request.preprocess_start_time = time.time()
@@ -499,6 +502,7 @@ class LLMEngine:
enable_thinking = kwargs.get("enable_thinking", None)
request = self.data_processor.process_request(request, self.cfg.max_model_len, enable_thinking=enable_thinking)
request.prompt_token_ids_len = len(request.prompt_token_ids)
request.need_prefill_tokens = request.prompt_token_ids_len
input_ids_len = request.prompt_token_ids_len
request.set(
"max_tokens",
@@ -526,6 +530,26 @@ class LLMEngine:
llm_logger.error(error_msg)
raise EngineError(error_msg, error_code=400)
if request.get("stop_seqs_len") is not None:
stop_seqs_len = request.get("stop_seqs_len")
max_stop_seqs_num = int(envs.FD_MAX_STOP_SEQS_NUM)
if len(stop_seqs_len) > max_stop_seqs_num:
error_msg = (
f"Length of stop ({stop_seqs_len}) exceeds the limit max_stop_seqs_num({max_stop_seqs_num})."
"Please reduce the number of stop or set a lager max_stop_seqs_num by `FD_MAX_STOP_SEQS_NUM`"
)
llm_logger.error(error_msg)
raise EngineError(error_msg, error_code=400)
stop_seqs_max_len = int(envs.FD_STOP_SEQS_MAX_LEN)
for single_stop_seq_len in stop_seqs_len:
if single_stop_seq_len > stop_seqs_max_len:
error_msg = (
f"Length of stop_seqs({single_stop_seq_len}) exceeds the limit stop_seqs_max_len({stop_seqs_max_len})."
"Please reduce the length of stop sequences or set a larger stop_seqs_max_len by `FD_STOP_SEQS_MAX_LEN`"
)
llm_logger.error(error_msg)
raise EngineError(error_msg, error_code=400)
if self.guided_decoding_checker is not None:
request, err_msg = self.guided_decoding_checker.schema_format(request)
if err_msg is not None:
@@ -745,8 +769,6 @@ class LLMEngine:
"""
Insert tasks to engine.
"""
for task in tasks:
start_span_request("DEQUEUE", task, trace.SpanKind.CONSUMER)
# TODO: return to scheduler
if allocated:
current_tasks = []
@@ -773,6 +795,11 @@ class LLMEngine:
self.engine_worker_queue.put_tasks((current_tasks, self.resource_manager.real_bsz))
return True
for task in tasks:
start_span_request("DEQUEUE", task, trace.SpanKind.CONSUMER)
if task.sampling_params.bad_words is not None:
task.sampling_params.update_from_tokenizer(self.data_processor.tokenizer)
self.resource_manager.check_and_free_block_tables()
if not isinstance(tasks, list):
@@ -1000,7 +1027,10 @@ class LLMEngine:
"FLAGS_use_append_attn": 1,
"NCCL_ALGO": "Ring",
"FLAGS_max_partition_size": int(os.getenv("FLAGS_max_partition_size", 32768)),
"FLAGS_hardamard_moe_block_size": 128,
"FLAGS_hardamard_moe_block_size": int(os.getenv("FLAGS_hardamard_moe_block_size", 128)),
"FLAGS_hardamard_use_diagonal_block_matrix": int(
os.getenv("FLAGS_hardamard_use_diagonal_block_matrix", 0)
),
}
# environment variables needed by Dy2St
variables.update(
@@ -1061,7 +1091,7 @@ class LLMEngine:
f" --devices {self.cfg.device_ids} {py_script}"
f" --max_num_seqs {self.cfg.max_num_seqs} --max_model_len {self.cfg.max_model_len}"
f" --gpu_memory_utilization {self.cfg.cache_config.gpu_memory_utilization}"
f" --model_name_or_path {self.cfg.model_name_or_path!s}"
f" --model {self.cfg.model_name_or_path!s}"
f" --device_ids {self.cfg.device_ids}"
f" --tensor_parallel_size {self.cfg.tensor_parallel_size}"
f" --engine_worker_queue_port {self.cfg.engine_worker_queue_port!s}"
@@ -1076,16 +1106,15 @@ class LLMEngine:
f" --splitwise_role {self.cfg.splitwise_role}"
f" --kv_cache_ratio {self.cfg.cache_config.kv_cache_ratio}"
f" --expert_parallel_size {self.cfg.parallel_config.expert_parallel_size}"
f" --data_parallel_size {self.cfg.parallel_config.data_parallel_size}"
f" --quantization {self.cfg.model_config.quantization}"
f" --ori_vocab_size {ori_vocab_size}"
f" --speculative_method {self.cfg.speculative_config.method}"
f" --speculative_max_draft_token_num {self.cfg.speculative_config.num_speculative_tokens}"
f" --speculative_model_name_or_path {self.cfg.speculative_config.model_name_or_path}"
f" --speculative_model_quantization {self.cfg.speculative_config.quantization}"
f" --speculative_benchmark_mode {self.cfg.speculative_config.benchmark_mode}"
f" --speculative_config '{self.cfg.speculative_config.to_json_string()}'"
f" --graph_optimization_config '{self.cfg.graph_optimization_config.to_json_string()}'"
f" --guided_decoding_backend {self.cfg.guided_decoding_backend}"
f" --load_strategy {self.cfg.model_config.load_strategy}"
f" --load_strategy {self.cfg.load_config.load_strategy}"
f" --early_stop_config '{self.cfg.early_stop_config.to_json_string()}'"
f" --load_choices {self.cfg.load_choices}"
)
worker_append_flag = {
@@ -1093,7 +1122,7 @@ class LLMEngine:
"enable_prefix_caching": self.cfg.cache_config.enable_prefix_caching,
"enable_chunked_prefill": self.cfg.cache_config.enable_chunked_prefill,
"do_profile": self.do_profile,
"dynamic_load_weight": self.cfg.model_config.dynamic_load_weight,
"dynamic_load_weight": self.cfg.load_config.dynamic_load_weight,
"disable_any_whitespace": self.cfg.disable_any_whitespace,
"enable_custom_all_reduce": self.cfg.parallel_config.enable_custom_all_reduce,
"enable_logprob": self.cfg.enable_logprob,
@@ -1232,9 +1261,9 @@ class LLMEngine:
elif (match := re.search(r"Start load layer (\d+)", line)) or (
match := re.search(r"set state for layer (\d+)", line)
):
progress = eval(match.group(1)) * 1.0 / self.cfg.model_config.num_layers
progress = eval(match.group(1)) * 1.0 / self.cfg.model_config.num_hidden_layers
self.worker_init_status["layer_loadding"] = progress
if self.worker_init_status["layer_loadding"] == self.cfg.model_config.num_layers - 1:
if self.worker_init_status["layer_loadding"] == self.cfg.model_config.num_hidden_layers - 1:
self.worker_init_status["finished"] = True
self.checking_worker_status_thread = threading.Thread(target=detect_thread, daemon=True)

View File

@@ -50,9 +50,10 @@ class ExpertService:
cfg (Config): Config object containing all the configuration parameters.
"""
self.cfg = cfg
start_pos = (local_data_parallel_id * self.cfg.tensor_parallel_size) % self.cfg.worker_num_per_node
end_pos = ((local_data_parallel_id + 1) * self.cfg.tensor_parallel_size) % self.cfg.worker_num_per_node
self.cfg.cache_config.rdma_comm_ports = self.cfg.cache_config.rdma_comm_ports[start_pos:end_pos]
start_pos = (local_data_parallel_id * self.cfg.tensor_parallel_size) % cfg.worker_num_per_node
end_pos = start_pos + self.cfg.tensor_parallel_size
if cfg.splitwise_role != "mixed":
self.cfg.cache_config.rdma_comm_ports = self.cfg.cache_config.rdma_comm_ports[start_pos:end_pos]
self.cfg.local_device_ids = self.cfg.device_ids.split(",")[start_pos:end_pos]
self.cfg.parallel_config.local_data_parallel_id = local_data_parallel_id
self.cfg.disaggregate_info = None
@@ -78,11 +79,13 @@ class ExpertService:
cfg.splitwise_role,
local_data_parallel_id,
)
if len(self.cfg.cache_config.pd_comm_port) == 1:
self.cfg.cache_config.pd_comm_port[0] = int(self.cfg.cache_config.pd_comm_port[0]) + local_data_parallel_id
else:
self.cfg.cache_config.pd_comm_port = [self.cfg.cache_config.pd_comm_port[local_data_parallel_id]]
if cfg.splitwise_role != "mixed":
if len(self.cfg.cache_config.pd_comm_port) == 1:
self.cfg.cache_config.pd_comm_port[0] = (
int(self.cfg.cache_config.pd_comm_port[0]) + local_data_parallel_id
)
else:
self.cfg.cache_config.pd_comm_port = [self.cfg.cache_config.pd_comm_port[local_data_parallel_id]]
self.split_connector = SplitwiseConnector(
self.cfg,
@@ -119,15 +122,16 @@ class ExpertService:
start_time = time.time()
llm_logger.info(f"start expert service {local_data_parallel_id}")
self.cache_manager_processes = self.resource_manager.cache_manager.launch_cache_manager(
cache_config=self.cfg.cache_config,
tensor_parallel_size=self.cfg.tensor_parallel_size,
device_ids=self.cfg.local_device_ids,
pod_ip=self.cfg.master_ip,
engine_worker_queue_port=self.cfg.engine_worker_queue_port,
pid_suffix=f"{local_data_parallel_id}_{ipc_signal_suffix}",
)
if self.cfg.splitwise_role != "mixed":
self.cache_manager_processes = self.resource_manager.cache_manager.launch_cache_manager(
cache_config=self.cfg.cache_config,
tensor_parallel_size=self.cfg.tensor_parallel_size,
device_ids=self.cfg.local_device_ids,
pod_ip=self.cfg.pod_ips[0],
engine_worker_queue_port=self.cfg.engine_worker_queue_port,
pid_suffix=f"{local_data_parallel_id}_{ipc_signal_suffix}",
)
self.split_mode_get_tasks()
self.insert_task_to_worker_thread = threading.Thread(target=self._insert_task_to_worker, args=())
self.insert_task_to_worker_thread.daemon = True
@@ -138,8 +142,6 @@ class ExpertService:
self.token_processor.run()
self.split_mode_get_tasks()
self.cfg.init_cache_info()
role = self.cfg.splitwise_role
@@ -321,13 +323,13 @@ class ExpertService:
else:
is_prefill = True
self.token_processor.number_of_input_tokens += tasks[i].prompt_token_ids_len
self.split_connector.send_cache_infos(tasks, current_id)
if is_decode or is_prefill:
self.split_connector.send_cache_infos(tasks, current_id)
for task in tasks:
task.infer_start_time = time.time()
if not is_decode:
llm_logger.info(f"Tasks are sent to engine, req_ids={req_ids}")
if not is_prefill:
if not is_prefill and self.cfg.cache_config.enable_chunked_prefill:
if not self.cfg.enable_mm:
self.update_requests_chunk_size(tasks)
else:

View File

@@ -25,7 +25,7 @@ import numpy as np
from fastdeploy.engine.sampling_params import SamplingParams
from fastdeploy.utils import data_processor_logger
from fastdeploy.worker.output import LogprobsLists
from fastdeploy.worker.output import LogprobsLists, SampleLogprobs
class RequestStatus(Enum):
@@ -60,6 +60,7 @@ class Request:
preprocess_end_time: Optional[float] = None,
multimodal_inputs: Optional[dict] = None,
multimodal_data: Optional[dict] = None,
disable_chat_template: bool = False,
disaggregate_info: Optional[dict] = None,
draft_token_ids: Optional[list[int]] = None,
guided_json: Optional[Any] = None,
@@ -87,6 +88,7 @@ class Request:
self.arrival_time = arrival_time
self.preprocess_start_time = preprocess_start_time
self.preprocess_end_time = preprocess_end_time
self.disable_chat_template = disable_chat_template
self.disaggregate_info = disaggregate_info
# speculative method in disaggregate-mode
@@ -103,6 +105,7 @@ class Request:
# Multi-modal related
self.multimodal_inputs = multimodal_inputs
self.multimodal_data = multimodal_data
self.multimodal_img_boundaries = None
self.enable_thinking = enable_thinking
self.trace_carrier = trace_carrier
@@ -115,6 +118,7 @@ class Request:
self.status = RequestStatus.WAITING
self.task_type = RequestType.PREFILL
self.idx = None
self.need_prefill_tokens = self.prompt_token_ids_len
@classmethod
def from_dict(cls, d: dict):
@@ -136,6 +140,7 @@ class Request:
preprocess_end_time=d.get("preprocess_end_time"),
multimodal_inputs=d.get("multimodal_inputs"),
multimodal_data=d.get("multimodal_data"),
disable_chat_template=d.get("disable_chat_template"),
disaggregate_info=d.get("disaggregate_info"),
draft_token_ids=d.get("draft_token_ids"),
guided_json=d.get("guided_json", None),
@@ -180,6 +185,7 @@ class Request:
"preprocess_end_time": self.preprocess_end_time,
"multimodal_inputs": self.multimodal_inputs,
"multimodal_data": self.multimodal_data,
"disable_chat_template": self.disable_chat_template,
"disaggregate_info": self.disaggregate_info,
"draft_token_ids": self.draft_token_ids,
"enable_thinking": self.enable_thinking,
@@ -239,6 +245,7 @@ class CompletionOutput:
token_ids: list[int]
logprob: Optional[float] = None
top_logprobs: Optional[LogprobsLists] = None
logprobs: Optional[SampleLogprobs] = None
draft_token_ids: list[int] = None
text: Optional[str] = None
reasoning_content: Optional[str] = None
@@ -253,6 +260,7 @@ class CompletionOutput:
"token_ids": self.token_ids,
"logprob": self.logprob,
"top_logprobs": self.top_logprobs,
"logprobs": self.logprobs,
"draft_token_ids": self.draft_token_ids,
"text": self.text,
"reasoning_content": self.reasoning_content,
@@ -275,7 +283,8 @@ class CompletionOutput:
f"text={self.text!r}, "
f"token_ids={self.token_ids}, "
f"draft_token_ids={self.draft_token_ids}, "
f"reasoning_content={self.reasoning_content!r}"
f"reasoning_content={self.reasoning_content!r}, "
f"logprobs={self.logprobs}, "
)
@@ -384,16 +393,20 @@ class RequestOutput:
def add(self, next_output: RequestOutput) -> None:
"""Merge RequestOutput into this one"""
self.prompt = next_output.prompt
self.prompt_token_ids = next_output.prompt_token_ids
self.finished |= next_output.finished
self.outputs.index = next_output.outputs.index
self.outputs.token_ids.extend(next_output.outputs.token_ids)
if next_output.metrics.arrival_time is not None and self.metrics.inference_start_time is not None:
self.metrics.model_forward_time = next_output.metrics.arrival_time - self.metrics.inference_start_time
if next_output.metrics.arrival_time is not None and self.metrics.arrival_time is not None:
self.metrics.model_execute_time = next_output.metrics.arrival_time - self.metrics.arrival_time
if next_output.outputs.top_logprobs is not None:
self.outputs.top_logprobs.logprob_token_ids.extend(next_output.outputs.top_logprobs.logprob_token_ids)
self.outputs.top_logprobs.logprobs.extend(next_output.outputs.top_logprobs.logprobs)
self.outputs.top_logprobs.sampled_token_ranks.extend(next_output.outputs.top_logprobs.sampled_token_ranks)
def __repr__(self) -> str:
return (
@@ -401,8 +414,9 @@ class RequestOutput:
f"prompt={self.prompt!r}, "
f"prompt_token_ids={self.prompt_token_ids}, "
f"outputs={self.outputs}, "
f"finished={self.finished}, "
f"num_cached_tokens={self.num_cached_tokens}, "
f"metrics={self.metrics}, "
f"num_cached_tokens={self.num_cached_tokens})"
)
@classmethod

View File

@@ -20,6 +20,8 @@ import random
from dataclasses import dataclass, fields
from typing import Any, List, Optional, Union
from fastdeploy.utils import llm_logger as logger
@dataclass
class SamplingParams:
@@ -90,12 +92,14 @@ class SamplingParams:
min_p: float = 0.0
seed: Optional[int] = None
stop: Optional[Union[str, List[str]]] = None
stop_token_ids: Optional[Union[List[List[int]], List[int]]] = None
stop_token_ids: Optional[List[int]] = None
stop_seqs_len: Optional[int] = None
max_tokens: Optional[int] = None
reasoning_max_tokens: Optional[int] = None
min_tokens: int = 1
logprobs: Optional[int] = None
bad_words: Optional[List[str]] = None
_bad_words_token_ids: Optional[List[int]] = None
@classmethod
def from_dict(cls, req_dict: dict[str, Any]) -> SamplingParams:
@@ -200,11 +204,44 @@ class SamplingParams:
raise ValueError("seed must be in [0, 922337203685477580], got " f"{self.seed}.")
def update_from_tokenizer(self, tokenizer):
"""
# TODO: Implement stop tokens and bad words support
# Currently stop tokens and bad words are not supported yet
"""
pass
"""Support bad words"""
if self.bad_words is None:
return
self._bad_words_token_ids = []
for bad_word in self.bad_words:
# To prohibit words both at the beginning
# and in the middle of text
# (related to add_prefix_space tokenizer parameter)
for add_prefix_space in [False, True]:
prefix = " " if add_prefix_space else ""
prompt = prefix + bad_word.lstrip()
prompt_token_ids = tokenizer.encode(text=prompt, add_special_tokens=False)["input_ids"]
if len(prompt_token_ids) != 1:
if not add_prefix_space:
logger.warning(
f"Skip bad_words: <{prompt}>."
f"Bad words should be a single token."
f"Got tokens: {prompt_token_ids}."
)
continue
if prompt_token_ids[0] > tokenizer.vocab_size:
if not add_prefix_space:
logger.warning(
f"Skip bad_words: <{prompt}>."
f"All token id values should be satisfying:"
f" 0 <= token_id < {tokenizer.vocab_size}."
f"Got token: {prompt_token_ids}."
)
continue
if prompt_token_ids not in self._bad_words_token_ids:
self._bad_words_token_ids.extend(prompt_token_ids)
@property
def bad_words_token_ids(self) -> Optional[List[int]]:
return self._bad_words_token_ids
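A hedged usage sketch of the bad_words handling above (assumes a tokenizer whose encode() returns a dict with "input_ids", and that the remaining SamplingParams fields have defaults):

# Hypothetical usage; `tokenizer` is assumed to be a loaded tokenizer object.
params = SamplingParams(bad_words=["badword"])
params.update_from_tokenizer(tokenizer)
# Single-token encodings of "badword" and " badword" (prefix-space variant) end up here:
print(params.bad_words_token_ids)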
@dataclass

View File

@@ -1,3 +1,19 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
import threading
import time
from collections import deque
@@ -6,6 +22,9 @@ from concurrent.futures import ThreadPoolExecutor
from dataclasses import dataclass
from typing import Union
import numpy as np
import paddle
from fastdeploy.engine.request import Request, RequestStatus, RequestType
from fastdeploy.engine.resource_manager import ResourceManager
from fastdeploy.utils import llm_logger
@@ -56,11 +75,13 @@ class ResourceManagerV1(ResourceManager):
self.running: list[Request] = []
self.finish_execution_pool = ThreadPoolExecutor(max_workers=1)
self.lock = threading.Lock()
self.to_be_rescheduled_request_id_set = set()
def allocated_slots(self, request: Request):
return len(request.block_tables) * self.config.cache_config.block_size
def get_new_block_nums(self, request: Request, num_new_tokens: int):
self.check_and_free_block_tables()
return (
request.num_computed_tokens + num_new_tokens + self.config.cache_config.block_size - 1
) // self.config.cache_config.block_size - len(request.block_tables)
@@ -77,6 +98,13 @@ class ResourceManagerV1(ResourceManager):
def _prepare_preempt_task(self, request):
return ScheduledPreemptTask(idx=request.idx, request_id=request.request_id)
def reschedule_preempt_task(self, request_id):
with self.lock:
if request_id in self.to_be_rescheduled_request_id_set and request_id in self.requests:
request = self.requests[request_id]
self.waiting.appendleft(request)
self.to_be_rescheduled_request_id_set.remove(request_id)
def _trigger_preempt(self, request, num_new_blocks, preempted_reqs, scheduled_reqs):
can_schedule = True
while True:
@@ -84,8 +112,9 @@ class ResourceManagerV1(ResourceManager):
preempted_req = self.running.pop()
preempted_req.status = RequestStatus.PREEMPTED
preempted_req.num_computed_tokens = 0
preempted_req.prefill_block_num = 0
self._free_blocks(preempted_req)
self.waiting.appendleft(preempted_req)
self.to_be_rescheduled_request_id_set.add(preempted_req.request_id)
preempted_reqs.append(preempted_req)
scheduled_reqs.append(self._prepare_preempt_task(preempted_req))
if preempted_req == request:
@@ -98,6 +127,91 @@ class ResourceManagerV1(ResourceManager):
break
return can_schedule
def _get_num_new_tokens(self, request, token_budget):
num_new_tokens = request.need_prefill_tokens - request.num_computed_tokens
num_new_tokens = min(num_new_tokens, token_budget)
if not self.config.enable_mm:
return num_new_tokens
inputs = request.multimodal_inputs
request.with_image = False
# Compatible with scenarios without images and videos.
if inputs["images"] is None:
return num_new_tokens
input_ids_lst = request.prompt_token_ids + request.output_token_ids
input_ids = paddle.to_tensor(input_ids_lst, dtype="int64")
image_patch_id = inputs["image_patch_id"]
if request.multimodal_img_boundaries is None:
grid_thw = []
for one in inputs["grid_thw"]:
if one[0] == 1:
grid_thw.append(one)
else:
grid_thw.extend([[2, one[1], one[2]]] * (one[0] // 2))
grid_thw = paddle.to_tensor(grid_thw, dtype="int64")
from fastdeploy.model_executor.ops.gpu import get_img_boundaries
request.multimodal_img_boundaries = get_img_boundaries(
task_input_ids=input_ids, grid_thw=grid_thw, image_patch_id=image_patch_id
).numpy()
grid_thw = grid_thw.numpy().reshape([-1, 3])
inputs["grid_thw"] = grid_thw
grid_thw = inputs["grid_thw"]
img_boundaries_idx = request.multimodal_img_boundaries[0]
img_num_per_boundary = request.multimodal_img_boundaries[1]
ori_prompt_len = img_boundaries_idx[-1].item()
pre_end_idx = request.num_computed_tokens
new_end_idx = pre_end_idx + num_new_tokens
if new_end_idx < ori_prompt_len and input_ids[new_end_idx - 1] == image_patch_id:
boundary_idx = np.searchsorted(img_boundaries_idx, new_end_idx, side="left").item()
if boundary_idx == len(img_boundaries_idx):
new_end_idx = ori_prompt_len
else:
new_end_idx = img_boundaries_idx[boundary_idx].item()
elif new_end_idx >= ori_prompt_len and paddle.sum(input_ids[pre_end_idx:new_end_idx] == image_patch_id):
new_end_idx = ori_prompt_len
num_new_tokens = new_end_idx - pre_end_idx
image_mask = input_ids[pre_end_idx:new_end_idx] == image_patch_id
request.with_image = image_mask.any()
if request.with_image:
pre_boundary_idx = np.searchsorted(img_boundaries_idx, pre_end_idx, side="left").item()
if pre_boundary_idx == len(img_boundaries_idx):
request.num_image_start = img_num_per_boundary[-1]
else:
pre_boundary_idx = (
pre_boundary_idx if pre_end_idx == img_boundaries_idx[pre_boundary_idx] else pre_boundary_idx - 1
)
request.num_image_start = img_num_per_boundary[pre_boundary_idx]
new_boundary_idx = np.searchsorted(img_boundaries_idx, new_end_idx, side="left").item()
if new_boundary_idx == len(img_boundaries_idx):
request.num_image_end = img_num_per_boundary[-1]
else:
new_boundary_idx = (
new_boundary_idx if new_end_idx == img_boundaries_idx[new_boundary_idx] else new_boundary_idx - 1
)
request.num_image_end = img_num_per_boundary[new_boundary_idx]
request.image_type_ids_start = np.sum(grid_thw[: request.num_image_start, 0])
request.image_type_ids_end = np.sum(grid_thw[: request.num_image_end, 0])
request.image_start = np.sum(np.prod(grid_thw[: request.num_image_start], axis=1))
request.image_end = np.sum(np.prod(grid_thw[: request.num_image_end], axis=1))
return num_new_tokens
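A simplified sketch of the boundary clamping above: a prefill chunk must not stop in the middle of an image's patch tokens, so the chunk end is snapped to the enclosing image boundary (values are illustrative):

import numpy as np

# Hypothetical prompt positions where image token spans end.
img_boundaries_idx = np.array([10, 30, 50])
pre_end_idx, num_new_tokens = 12, 10
new_end_idx = pre_end_idx + num_new_tokens  # 22 -> falls inside the second image span

boundary_idx = np.searchsorted(img_boundaries_idx, new_end_idx, side="left").item()
if boundary_idx < len(img_boundaries_idx):
    new_end_idx = img_boundaries_idx[boundary_idx].item()
print(new_end_idx - pre_end_idx)  # 18 tokens are scheduled instead of 10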
def exist_prefill(self, scheduled_reqs):
for request in scheduled_reqs:
if request.task_type == RequestType.PREFILL:
return True
return False
def schedule(self):
with self.lock:
scheduled_reqs: list[Request] = []
@@ -109,9 +223,17 @@ class ResourceManagerV1(ResourceManager):
num_decoding_req_nums = 0
while req_index < len(self.running) and token_budget > 0:
request = self.running[req_index]
if request.num_computed_tokens >= request.prompt_token_ids_len: # to be decoding
if request.num_total_tokens > request.prompt_token_ids_len: # has generated tokens
if request.num_computed_tokens >= request.need_prefill_tokens: # to be decoding
if request.num_total_tokens > request.need_prefill_tokens: # has generated tokens
request.num_computed_tokens = request.num_total_tokens - 1
else: # prefill finished
if (
self.config.cache_config.enable_prefix_caching
and request.get("prefill_block_num", None) is None
):
# update prefill cache blocks for prefix caching
request.prefill_block_num = len(request.block_tables)
self.cache_manager.update_cache_blocks(request, self.config.cache_config.block_size)
if (
self.allocated_slots(request) - request.num_total_tokens
<= self.config.cache_config.prealloc_dec_block_slot_num_threshold
@@ -143,10 +265,9 @@ class ResourceManagerV1(ResourceManager):
token_budget -= 1
else: # need to prefill
llm_logger.debug(
f"scheduler prefill task: {request} request.prompt_token_ids_len {request.prompt_token_ids_len} request.num_computed_tokens {request.num_computed_tokens}"
f"scheduler prefill task: {request} request.need_prefill_tokens {request.need_prefill_tokens} request.num_computed_tokens {request.num_computed_tokens}"
)
num_new_tokens = request.prompt_token_ids_len - request.num_computed_tokens
num_new_tokens = min(num_new_tokens, token_budget)
num_new_tokens = self._get_num_new_tokens(request, token_budget)
num_new_block = self.get_new_block_nums(request, num_new_tokens)
# Allocate blocks to prefill
if self.cache_manager.can_allocate_gpu_blocks(num_new_block):
@@ -168,14 +289,22 @@ class ResourceManagerV1(ResourceManager):
while self.waiting and token_budget > 0:
if len(self.running) == self.max_num_seqs:
break
if (self.config.enable_mm or paddle.is_compiled_with_xpu()) and self.exist_prefill(scheduled_reqs):
break
request = self.waiting[0]
if request.status == RequestStatus.WAITING:
num_new_tokens = request.num_total_tokens - request.num_computed_tokens
num_new_tokens = min(num_new_tokens, token_budget)
# Enable prefix caching
if self.config.cache_config.enable_prefix_caching:
success = self.get_prefix_cached_blocks(request)
if not success:
break
num_new_tokens = self._get_num_new_tokens(request, token_budget)
num_new_block = self.get_new_block_nums(request, num_new_tokens)
# Allocate blocks to prefill
if self.cache_manager.can_allocate_gpu_blocks(num_new_block):
request.block_tables.extend(self.cache_manager.allocate_gpu_blocks(num_new_block))
if not request.get("skip_allocate", False):
request.block_tables.extend(self.cache_manager.allocate_gpu_blocks(num_new_block))
self.waiting.popleft()
self.running.append(request)
scheduled_reqs.append(self._prepare_prefill_task(request, num_new_tokens))
@@ -192,8 +321,10 @@ class ResourceManagerV1(ResourceManager):
else:
break
elif request.status == RequestStatus.PREEMPTED:
num_new_tokens = request.num_total_tokens - request.num_computed_tokens
num_new_tokens = min(num_new_tokens, token_budget)
request.need_prefill_tokens = (
request.num_total_tokens
) # Before a preempted task is rescheduled, it has already been sent to the engine and produces no more tokens, so num_total_tokens is static and correct here
num_new_tokens = self._get_num_new_tokens(request, token_budget)
num_new_block = self.get_new_block_nums(request, num_new_tokens)
# Allocate blocks to prefill
if self.cache_manager.can_allocate_gpu_blocks(num_new_block):
@@ -227,12 +358,52 @@ class ResourceManagerV1(ResourceManager):
break
return self.real_bsz
def get_prefix_cached_blocks(self, request: Request):
"""
set prefix cached information for the given request
"""
try:
cache_prepare_time = time.time()
(common_block_ids, matched_token_num, hit_info) = self.cache_manager.request_match_blocks(
request, self.config.cache_config.block_size
)
matched_block_num = len(common_block_ids)
no_cache_block_num = self.cache_manager.get_required_block_num(
request.prompt_token_ids_len - matched_token_num,
self.config.cache_config.block_size,
)
request.num_cached_tokens = matched_token_num
request.gpu_cache_token_num = hit_info["gpu_cache_blocks"] * self.config.cache_config.block_size
request.cpu_cache_token_num = hit_info["cpu_cache_blocks"] * self.config.cache_config.block_size
request.cache_info = (matched_block_num, no_cache_block_num)
request.block_tables = common_block_ids
request.skip_allocate = False
if matched_token_num == request.prompt_token_ids_len:
request.num_computed_tokens = matched_token_num - 1
request.skip_allocate = True
else:
request.num_computed_tokens = matched_token_num
request.cache_prepare_time = time.time() - cache_prepare_time
return True
except Exception as e:
llm_logger.error(f"prefix match blocks error: {e}, waiting reschedule...")
return False
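A small sketch of the block accounting above, assuming get_required_block_num is a ceiling division over block_size:

def required_block_num(uncached_token_num: int, block_size: int) -> int:
    # Assumed behaviour of cache_manager.get_required_block_num.
    return (uncached_token_num + block_size - 1) // block_size

prompt_len, matched_token_num, block_size = 1000, 512, 64
no_cache_block_num = required_block_num(prompt_len - matched_token_num, block_size)  # 8 new blocks needed

# When the whole prompt is cached, one token is left to recompute so decoding can start.
num_computed_tokens = matched_token_num - 1 if matched_token_num == prompt_len else matched_token_num
print(no_cache_block_num, num_computed_tokens)  # 8 512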
def add_request(self, request: Request) -> None:
self.waiting.append(request)
self.requests[request.request_id] = request
with self.lock:
self.waiting.append(request)
self.requests[request.request_id] = request
def _free_blocks(self, request: Request):
self.cache_manager.recycle_gpu_blocks(request.block_tables)
if self.config.cache_config.enable_prefix_caching:
# TODO(chengyanfu): support caching output blocks for prefix caching
self.cache_manager.release_block_ids_async(request)
self.cache_manager.recycle_gpu_blocks(request.block_tables[request.prefill_block_num :])
else:
self.cache_manager.recycle_gpu_blocks(request.block_tables)
request.block_tables = []
def finish_requests_async(self, request_ids: Union[str, Iterable[str]]):
@@ -251,9 +422,20 @@ class ResourceManagerV1(ResourceManager):
if request is None:
# Invalid request ID.
continue
request.status = RequestStatus.FINISHED
self.running.remove(request)
self._free_blocks(request)
if request in self.running: # normally run and finished
self.running.remove(request)
request.status = RequestStatus.FINISHED
self._free_blocks(request)
if (
request.request_id in self.to_be_rescheduled_request_id_set
): # finished after preempted, blocks have been recycled.
self.to_be_rescheduled_request_id_set.remove(
request.request_id
) # just remove from to_be_rescheduled_request_id_set
if (
request in self.waiting
): # a finished request was scheduled from preempted back to waiting; this is an unexpected error and should not happen
raise RuntimeError(f"request {request.request_id} scheduled into waiting list, after finished")
self.tasks_list[request.idx] = None
self.stop_flags[request.idx] = True
del self.requests[req_id]

View File

@@ -19,11 +19,12 @@ import uuid
import numpy as np
from fastdeploy import envs
from fastdeploy.input.preprocess import InputPreprocessor
from fastdeploy.inter_communicator import IPCSignal, ZmqClient
from fastdeploy.metrics.work_metrics import work_process_metrics
from fastdeploy.platforms import current_platform
from fastdeploy.utils import EngineError, api_server_logger
from fastdeploy.utils import EngineError, StatefulSemaphore, api_server_logger
class EngineClient:
@@ -42,6 +43,8 @@ class EngineClient:
enable_mm=False,
reasoning_parser=None,
data_parallel_size=1,
enable_logprob=False,
workers=1,
):
input_processor = InputPreprocessor(
tokenizer,
@@ -50,6 +53,7 @@ class EngineClient:
mm_processor_kwargs,
enable_mm,
)
self.enable_logprob = enable_logprob
self.enable_mm = enable_mm
self.reasoning_parser = reasoning_parser
self.data_processor = input_processor.create_processor()
@@ -73,6 +77,7 @@ class EngineClient:
suffix=pid,
create=False,
)
self.semaphore = StatefulSemaphore((envs.FD_SUPPORT_MAX_CONNECTIONS + workers - 1) // workers)
def create_zmq_client(self, model, mode):
"""
@@ -142,6 +147,26 @@ class EngineClient:
api_server_logger.error(error_msg)
raise EngineError(error_msg, error_code=400)
if "stop_seqs_len" in task:
stop_seqs_len = task["stop_seqs_len"]
max_stop_seqs_num = int(envs.FD_MAX_STOP_SEQS_NUM)
if len(stop_seqs_len) > max_stop_seqs_num:
error_msg = (
f"Length of stop ({stop_seqs_len}) exceeds the limit max_stop_seqs_num({max_stop_seqs_num})."
"Please reduce the number of stop or set a lager max_stop_seqs_num by `FD_MAX_STOP_SEQS_NUM`"
)
api_server_logger.error(error_msg)
raise EngineError(error_msg, error_code=400)
stop_seqs_max_len = int(envs.FD_STOP_SEQS_MAX_LEN)
for single_stop_seq_len in stop_seqs_len:
if single_stop_seq_len > stop_seqs_max_len:
error_msg = (
f"Length of stop_seqs({single_stop_seq_len}) exceeds the limit stop_seqs_max_len({stop_seqs_max_len})."
"Please reduce the length of stop sequences or set a larger stop_seqs_max_len by `FD_STOP_SEQS_MAX_LEN`"
)
api_server_logger.error(error_msg)
raise EngineError(error_msg, error_code=400)
task["preprocess_end_time"] = time.time()
preprocess_cost_time = task["preprocess_end_time"] - task["preprocess_start_time"]
api_server_logger.info(
@@ -200,6 +225,44 @@ class EngineClient:
if data.get("stream_options") and not data.get("stream"):
raise ValueError("Stream options can only be defined when `stream=True`.")
# logprobs
logprobs = data.get("logprobs")
top_logprobs = None
if isinstance(logprobs, bool) and logprobs:
if not self.enable_logprob:
err_msg = "Logprobs is disabled, please enable it in startup config."
api_server_logger.error(err_msg)
raise ValueError(err_msg)
top_logprobs = data.get("top_logprobs")
elif isinstance(logprobs, int):
top_logprobs = logprobs
elif logprobs:
raise ValueError("Invalid type for 'logprobs'")
# enable_logprob
if top_logprobs:
if not self.enable_logprob:
err_msg = "Logprobs is disabled, please enable it in startup config."
api_server_logger.error(err_msg)
raise ValueError(err_msg)
if not isinstance(top_logprobs, int):
err_type = type(top_logprobs).__name__
err_msg = f"Invalid type for 'top_logprobs': expected int but got {err_type}."
api_server_logger.error(err_msg)
raise ValueError(err_msg)
if top_logprobs < 0:
err_msg = f"Invalid 'top_logprobs': must be >= 0, got {top_logprobs}."
api_server_logger.error(err_msg)
raise ValueError(err_msg)
if top_logprobs > 20:
err_msg = "Invalid value for 'top_logprobs': must be <= 20."
api_server_logger.error(err_msg)
raise ValueError(err_msg)
def check_health(self, time_interval_threashold=30):
"""
Check the health of the model server by checking whether all workers are alive.

View File

@@ -31,6 +31,7 @@ from fastdeploy.engine.sampling_params import SamplingParams
# from fastdeploy.entrypoints.chat_utils import ChatCompletionMessageParam
from fastdeploy.utils import llm_logger, retrive_model_from_server
from fastdeploy.worker.output import Logprob, LogprobsLists
root_logger = logging.getLogger()
for handler in root_logger.handlers[:]:
@@ -68,12 +69,14 @@ class LLM:
model: str,
revision: Optional[str] = "master",
tokenizer: Optional[str] = None,
enable_logprob: Optional[bool] = False,
**kwargs,
):
model = retrive_model_from_server(model, revision)
engine_args = EngineArgs(
model=model,
tokenizer=tokenizer,
enable_logprob=enable_logprob,
**kwargs,
)
@@ -169,8 +172,10 @@ class LLM:
req_ids = self._add_request(prompts=prompts, sampling_params=sampling_params)
topk_logprobs = sampling_params[0].logprobs if sampling_params_len > 1 else sampling_params.logprobs
# get output
outputs = self._run_engine(req_ids, use_tqdm=use_tqdm)
outputs = self._run_engine(req_ids, use_tqdm=use_tqdm, topk_logprobs=topk_logprobs)
for i in range(len(outputs)):
outputs[i].prompt = prompts[i]
return outputs
@@ -223,8 +228,10 @@ class LLM:
chat_template_kwargs=chat_template_kwargs,
)
topk_logprobs = sampling_params[0].logprobs if sampling_params_len > 1 else sampling_params.logprobs
# get output
outputs = self._run_engine(req_ids, use_tqdm=use_tqdm)
outputs = self._run_engine(req_ids, use_tqdm=use_tqdm, topk_logprobs=topk_logprobs)
return outputs
def _add_request(
@@ -278,7 +285,55 @@ class LLM:
self.llm_engine.add_requests(tasks, current_sampling_params, enable_thinking=enable_thinking)
return req_ids
def _run_engine(self, req_ids: list[str], use_tqdm: bool):
def _decode_token(self, token_id: int) -> str:
"""Decodes a single token ID into its string representation."""
return self.llm_engine.data_processor.process_logprob_response([token_id], clean_up_tokenization_spaces=False)
def _build_sample_logprobs(self, logprobs_lists: LogprobsLists, topk_logprobs: int) -> list[dict[int, Logprob]]:
"""
Constructs a list of dictionaries mapping token IDs to Logprob objects,
based on sliced LogprobsLists data (excluding the sampled token at index 0).
Args:
logprobs_lists (LogprobsLists): Contains top-k token IDs, logprobs, and sampled ranks.
topk_logprobs (int): Maximum number of top logprobs to include (excluding the sampled token at index 0).
Returns:
list[dict[int, Logprob]]: One dict per request, mapping token ID to Logprob.
"""
try:
llm_logger.info(f"filter logprobs, topk_logprobs: {topk_logprobs}")
if not logprobs_lists.logprob_token_ids:
llm_logger.warning("Empty logprob_token_ids in LogprobsLists")
return None
# exclude sampled token at index 0
available_topk = len(logprobs_lists.logprob_token_ids[0]) - 1
effective_topk_logprobs = min(topk_logprobs, available_topk)
if effective_topk_logprobs <= 0:
llm_logger.warning(
f"Invalid effective_topk_logprobs={effective_topk_logprobs}, "
f"available_topk={available_topk}, topk_logprobs={topk_logprobs}; returning empty result."
)
return None
# sliced 1 ~ (1 + effective_topk_logprobs)
sliced_logprobs_lists = logprobs_lists.slice_columns(1, 1 + effective_topk_logprobs)
result = []
for token_ids, logprobs in zip(sliced_logprobs_lists.logprob_token_ids, sliced_logprobs_lists.logprobs):
logprob_dict = {
token_id: Logprob(logprob=logprob, rank=i + 1, decoded_token=self._decode_token(token_id))
for i, (token_id, logprob) in enumerate(zip(token_ids, logprobs))
}
result.append(logprob_dict)
return result
except Exception as e:
llm_logger.error(f"Error building sample logprobs from LogprobsLists: {e}")
def _run_engine(self, req_ids: list[str], use_tqdm: bool, topk_logprobs: Optional[int] = None):
"""
Run the engine and return the list of results.
@@ -320,6 +375,13 @@ class LLM:
result = self.req_output.pop(req_id)
result = self.llm_engine.data_processor.process_response(result)
# filter logprobs
if result.outputs.top_logprobs and topk_logprobs:
result.outputs.logprobs = self._build_sample_logprobs(
result.outputs.top_logprobs, topk_logprobs
)
output[pos] = result
finished.append(i)

View File

@@ -14,15 +14,17 @@
# limitations under the License.
"""
import asyncio
import os
import threading
import time
from collections.abc import AsyncGenerator
from contextlib import asynccontextmanager
from multiprocessing import current_process
import uvicorn
import zmq
from fastapi import FastAPI, Request
from fastapi import FastAPI, HTTPException, Request
from fastapi.responses import JSONResponse, Response, StreamingResponse
from prometheus_client import CONTENT_TYPE_LATEST
@@ -45,9 +47,10 @@ from fastdeploy.metrics.metrics import (
get_filtered_metrics,
main_process_metrics,
)
from fastdeploy.metrics.trace_util import inject_to_metadata, instrument
from fastdeploy.metrics.trace_util import fd_start_span, inject_to_metadata, instrument
from fastdeploy.utils import (
FlexibleArgumentParser,
StatefulSemaphore,
api_server_logger,
console_logger,
is_port_available,
@@ -60,6 +63,13 @@ parser.add_argument("--host", default="0.0.0.0", type=str, help="host to the htt
parser.add_argument("--workers", default=1, type=int, help="number of workers")
parser.add_argument("--metrics-port", default=8001, type=int, help="port for metrics server")
parser.add_argument("--controller-port", default=-1, type=int, help="port for controller server")
parser.add_argument(
"--max-waiting-time",
default=-1,
type=int,
help="max waiting time for connection, if set value -1 means no waiting time limit",
)
parser.add_argument("--max-concurrency", default=512, type=int, help="max concurrency")
parser = EngineArgs.add_cli_args(parser)
args = parser.parse_args()
args.model = retrive_model_from_server(args.model, args.revision)
@@ -114,10 +124,12 @@ async def lifespan(app: FastAPI):
args.enable_mm,
args.reasoning_parser,
args.data_parallel_size,
args.enable_logprob,
args.workers,
)
app.state.dynamic_load_weight = args.dynamic_load_weight
chat_handler = OpenAIServingChat(engine_client, pid, args.ips)
completion_handler = OpenAIServingCompletion(engine_client, pid, args.ips)
chat_handler = OpenAIServingChat(engine_client, pid, args.ips, args.max_waiting_time)
completion_handler = OpenAIServingCompletion(engine_client, pid, args.ips, args.max_waiting_time)
engine_client.create_zmq_client(model=pid, mode=zmq.PUSH)
engine_client.pid = pid
app.state.engine_client = engine_client
@@ -139,6 +151,41 @@ app = FastAPI(lifespan=lifespan)
instrument(app)
MAX_CONCURRENT_CONNECTIONS = (args.max_concurrency + args.workers - 1) // args.workers
connection_semaphore = StatefulSemaphore(MAX_CONCURRENT_CONNECTIONS)
@asynccontextmanager
async def connection_manager():
"""
async context manager for connection manager
"""
try:
await asyncio.wait_for(connection_semaphore.acquire(), timeout=0.001)
yield
except asyncio.TimeoutError:
api_server_logger.info(f"Reach max request release: {connection_semaphore.status()}")
raise HTTPException(
status_code=429, detail=f"Too many requests, current max concurrency is {args.max_concurrency}"
)
def wrap_streaming_generator(original_generator: AsyncGenerator):
"""
Wrap an async generator to release the connection semaphore when the generator is finished.
"""
async def wrapped_generator():
try:
async for chunk in original_generator:
yield chunk
finally:
api_server_logger.debug(f"current concurrency status: {connection_semaphore.status()}")
connection_semaphore.release()
return wrapped_generator
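A framework-free sketch of the connection-limit pattern above, using a plain asyncio.Semaphore in place of StatefulSemaphore (limit and timings are illustrative):

import asyncio

semaphore = asyncio.Semaphore(2)

async def handle_request(i: int) -> str:
    try:
        await asyncio.wait_for(semaphore.acquire(), timeout=0.001)
    except asyncio.TimeoutError:
        return f"request {i}: 429 Too Many Requests"
    try:
        await asyncio.sleep(0.01)   # stand-in for producing the response
        return f"request {i}: ok"
    finally:
        # In the real code, streaming responses release the semaphore only when
        # the wrapped generator finishes.
        semaphore.release()

async def main():
    print(await asyncio.gather(*(handle_request(i) for i in range(4))))

asyncio.run(main())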
# TODO: pass the real engine status, obtained via pid
@app.get("/health")
def health(request: Request) -> Response:
@@ -197,20 +244,30 @@ async def create_chat_completion(request: ChatCompletionRequest):
"""
Create a chat completion for the provided prompt and parameters.
"""
api_server_logger.info(f"Chat Received request: {request.model_dump_json()}")
if app.state.dynamic_load_weight:
status, msg = app.state.engine_client.is_workers_alive()
if not status:
return JSONResponse(content={"error": "Worker Service Not Healthy"}, status_code=304)
inject_to_metadata(request)
generator = await app.state.chat_handler.create_chat_completion(request)
try:
async with connection_manager():
inject_to_metadata(request)
generator = await app.state.chat_handler.create_chat_completion(request)
if isinstance(generator, ErrorResponse):
connection_semaphore.release()
api_server_logger.debug(f"current concurrency status: {connection_semaphore.status()}")
return JSONResponse(content={"detail": generator.model_dump()}, status_code=generator.code)
elif isinstance(generator, ChatCompletionResponse):
connection_semaphore.release()
api_server_logger.debug(f"current concurrency status: {connection_semaphore.status()}")
return JSONResponse(content=generator.model_dump())
else:
wrapped_generator = wrap_streaming_generator(generator)
return StreamingResponse(content=wrapped_generator(), media_type="text/event-stream")
if isinstance(generator, ErrorResponse):
return JSONResponse(content=generator.model_dump(), status_code=generator.code)
elif isinstance(generator, ChatCompletionResponse):
return JSONResponse(content=generator.model_dump())
return StreamingResponse(content=generator, media_type="text/event-stream")
except HTTPException as e:
api_server_logger.error(f"Error in chat completion: {str(e)}")
return JSONResponse(status_code=e.status_code, content={"detail": e.detail})
@app.post("/v1/completions")
@@ -218,18 +275,26 @@ async def create_completion(request: CompletionRequest):
"""
Create a completion for the provided prompt and parameters.
"""
api_server_logger.info(f"Completion Received request: {request.model_dump_json()}")
if app.state.dynamic_load_weight:
status, msg = app.state.engine_client.is_workers_alive()
if not status:
return JSONResponse(content={"error": "Worker Service Not Healthy"}, status_code=304)
generator = await app.state.completion_handler.create_completion(request)
if isinstance(generator, ErrorResponse):
return JSONResponse(content=generator.model_dump(), status_code=generator.code)
elif isinstance(generator, CompletionResponse):
return JSONResponse(content=generator.model_dump())
return StreamingResponse(content=generator, media_type="text/event-stream")
try:
async with connection_manager():
generator = await app.state.completion_handler.create_completion(request)
if isinstance(generator, ErrorResponse):
connection_semaphore.release()
return JSONResponse(content=generator.model_dump(), status_code=generator.code)
elif isinstance(generator, CompletionResponse):
connection_semaphore.release()
return JSONResponse(content=generator.model_dump())
else:
wrapped_generator = wrap_streaming_generator(generator)
return StreamingResponse(content=wrapped_generator(), media_type="text/event-stream")
except HTTPException as e:
return JSONResponse(status_code=e.status_code, content={"detail": e.detail})
@app.get("/update_model_weight")
@@ -269,6 +334,7 @@ def launch_api_server() -> None:
api_server_logger.info(f"launch Fastdeploy api server... port: {args.port}")
api_server_logger.info(f"args: {args.__dict__}")
fd_start_span("FD_START")
try:
uvicorn.run(


@@ -18,7 +18,7 @@ from __future__ import annotations
import json
import time
from typing import Any, List, Literal, Optional, Union
from typing import Any, Dict, List, Literal, Optional, Union
from pydantic import BaseModel, Field, model_validator
@@ -126,6 +126,8 @@ class ChatMessage(BaseModel):
tool_calls: Optional[List[DeltaToolCall | ToolCall]] = None
prompt_token_ids: Optional[List[int]] = None
completion_token_ids: Optional[List[int]] = None
text_after_process: Optional[str] = None
raw_prediction: Optional[str] = None
class ChatCompletionResponseChoice(BaseModel):
@@ -183,6 +185,8 @@ class DeltaMessage(BaseModel):
completion_token_ids: Optional[List[int]] = None
reasoning_content: Optional[str] = None
tool_calls: Optional[List[DeltaToolCall | ToolCall]] = None
text_after_process: Optional[str] = None
raw_prediction: Optional[str] = None
class ChatCompletionResponseStreamChoice(BaseModel):
@@ -219,8 +223,10 @@ class CompletionResponseChoice(BaseModel):
text: str
prompt_token_ids: Optional[List[int]] = None
completion_token_ids: Optional[List[int]] = None
text_after_process: Optional[str] = None
raw_prediction: Optional[str] = None
arrival_time: Optional[float] = None
logprobs: Optional[int] = None
logprobs: Optional[CompletionLogprobs] = None
reasoning_content: Optional[str] = None
finish_reason: Optional[Literal["stop", "length", "tool_calls"]]
tool_calls: Optional[List[DeltaToolCall | ToolCall]] = None
@@ -239,6 +245,17 @@ class CompletionResponse(BaseModel):
usage: UsageInfo
class CompletionLogprobs(BaseModel):
"""
Completion logprobs.
"""
tokens: Optional[List[str]] = None
token_logprobs: Optional[List[float]] = None
top_logprobs: Optional[List[Dict]] = None
text_offset: Optional[List[int]] = None
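The new CompletionLogprobs model mirrors the OpenAI completions logprobs layout: parallel lists indexed by generated token. A hedged sketch of building and serializing one, with a local re-declaration of the model and made-up values:

from typing import Dict, List, Optional
from pydantic import BaseModel

# Local re-declaration so the sketch is self-contained; the real definition
# lives in FastDeploy's protocol module.
class CompletionLogprobs(BaseModel):
    tokens: Optional[List[str]] = None
    token_logprobs: Optional[List[float]] = None
    top_logprobs: Optional[List[Dict]] = None
    text_offset: Optional[List[int]] = None

# Parallel lists, one entry per generated token (values are made up).
lp = CompletionLogprobs(
    tokens=["Hello", ",", " world"],
    token_logprobs=[-0.11, -0.63, -0.02],
    top_logprobs=[{"Hello": -0.11, "Hi": -2.3}, {",": -0.63}, {" world": -0.02}],
    text_offset=[0, 5, 6],
)
print(lp.model_dump_json())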
class CompletionResponseStreamChoice(BaseModel):
"""
Completion response choice for stream response.
@@ -247,9 +264,11 @@ class CompletionResponseStreamChoice(BaseModel):
index: int
text: str
arrival_time: float = None
logprobs: Optional[CompletionLogprobs] = None
prompt_token_ids: Optional[List[int]] = None
completion_token_ids: Optional[List[int]] = None
logprobs: Optional[float] = None
text_after_process: Optional[str] = None
raw_prediction: Optional[str] = None
reasoning_content: Optional[str] = None
finish_reason: Optional[Literal["stop", "length", "tool_calls"]] = None
tool_calls: Optional[List[DeltaToolCall | ToolCall]] = None
@@ -343,24 +362,29 @@ class CompletionRequest(BaseModel):
suffix: Optional[dict] = None
temperature: Optional[float] = None
top_p: Optional[float] = None
user: Optional[str] = None
# doc: begin-completion-sampling-params
top_k: Optional[int] = None
min_p: Optional[float] = None
user: Optional[str] = None
extra_body: Optional[dict] = None
return_token_ids: Optional[bool] = False
prompt_token_ids: Optional[List[int]] = None
repetition_penalty: Optional[float] = None
stop_token_ids: Optional[List[int]] = Field(default_factory=list)
min_tokens: Optional[int] = None
include_stop_str_in_output: Optional[bool] = False
bad_words: Optional[List[str]] = None
# doc: end-completion-sampling-params
# doc: start-completion-extra-params
response_format: Optional[AnyResponseFormat] = None
guided_json: Optional[Union[str, dict, BaseModel]] = None
guided_regex: Optional[str] = None
guided_choice: Optional[list[str]] = None
guided_grammar: Optional[str] = None
# doc: begin-completion-sampling-params
repetition_penalty: Optional[float] = None
stop_token_ids: Optional[List[int]] = Field(default_factory=list)
# doc: end-completion-sampling-params
max_streaming_response_tokens: Optional[int] = None
return_token_ids: Optional[bool] = None
prompt_token_ids: Optional[List[int]] = None
# doc: end-completion-extra-params
def to_dict_for_infer(self, request_id=None, prompt=None):
"""
@@ -373,16 +397,13 @@ class CompletionRequest(BaseModel):
if request_id is not None:
req_dict["request_id"] = request_id
# parse request model into dict, priority: request > extra_body > suffix
# parse request model into dict
if self.suffix is not None:
for key, value in self.suffix.items():
req_dict[key] = value
for key, value in self.dict().items():
if value is not None:
req_dict[key] = value
if self.extra_body is not None:
for key, value in self.extra_body.items():
req_dict.setdefault(key, value)
if self.suffix is not None:
for key, value in self.suffix.items():
req_dict.setdefault(key, value)
if prompt is not None:
req_dict["prompt"] = prompt
@@ -476,26 +497,34 @@ class ChatCompletionRequest(BaseModel):
stream_options: Optional[StreamOptions] = None
temperature: Optional[float] = None
top_p: Optional[float] = None
top_k: Optional[int] = None
min_p: Optional[float] = None
user: Optional[str] = None
metadata: Optional[dict] = None
extra_body: Optional[dict] = None
return_token_ids: Optional[bool] = False
prompt_token_ids: Optional[List[int]] = None
response_format: Optional[AnyResponseFormat] = None
# doc: begin-chat-completion-sampling-params
top_k: Optional[int] = None
min_p: Optional[float] = None
min_tokens: Optional[int] = None
include_stop_str_in_output: Optional[bool] = False
bad_words: Optional[List[str]] = None
repetition_penalty: Optional[float] = None
stop_token_ids: Optional[List[int]] = Field(default_factory=list)
# doc: end-chat-completion-sampling-params
# doc: start-completion-extra-params
chat_template_kwargs: Optional[dict] = None
reasoning_max_tokens: Optional[int] = None
structural_tag: Optional[str] = None
guided_json: Optional[Union[str, dict, BaseModel]] = None
guided_regex: Optional[str] = None
guided_choice: Optional[list[str]] = None
guided_grammar: Optional[str] = None
structural_tag: Optional[str] = None
# doc: begin-chat-completion-sampling-params
repetition_penalty: Optional[float] = None
stop_token_ids: Optional[List[int]] = Field(default_factory=list)
# doc: end-chat-completion-sampling-params
return_token_ids: Optional[bool] = None
prompt_token_ids: Optional[List[int]] = None
max_streaming_response_tokens: Optional[int] = None
disable_chat_template: Optional[bool] = False
# doc: end-chat-completion-extra-params
def to_dict_for_infer(self, request_id=None):
"""
@@ -511,19 +540,16 @@ class ChatCompletionRequest(BaseModel):
req_dict["max_tokens"] = self.max_completion_tokens or self.max_tokens
req_dict["logprobs"] = self.top_logprobs if self.logprobs else None
# parse request model into dict, priority: request > extra_body > metadata
for key, value in self.dict().items():
if value is not None:
req_dict[key] = value
if self.extra_body is not None:
for key, value in self.extra_body.items():
req_dict.setdefault(key, value)
# parse request model into dict, priority: request params > metadata params
if self.metadata is not None:
assert (
"raw_request" not in self.metadata
), "The parameter `raw_request` is not supported now, please use completion api instead."
for key, value in self.metadata.items():
req_dict.setdefault(key, value)
req_dict[key] = value
for key, value in self.dict().items():
if value is not None:
req_dict[key] = value
if "prompt_token_ids" in req_dict:
if "messages" in req_dict:
@@ -531,6 +557,11 @@ class ChatCompletionRequest(BaseModel):
else:
assert len(self.messages) > 0
# If disable_chat_template is set, then the first message in messages will be used as the prompt.
if self.disable_chat_template:
req_dict["prompt"] = req_dict["messages"][0]["content"]
del req_dict["messages"]
guided_json_object = None
if self.response_format is not None:
if self.response_format.type == "json_object":


@@ -49,10 +49,11 @@ class OpenAIServingChat:
OpenAI-style chat completions serving
"""
def __init__(self, engine_client, pid, ips):
def __init__(self, engine_client, pid, ips, max_waiting_time):
self.engine_client = engine_client
self.pid = pid
self.master_ip = ips
self.max_waiting_time = max_waiting_time
self.host_ip = get_host_ip()
if self.master_ip is not None:
if isinstance(self.master_ip, list):
@@ -76,31 +77,46 @@ class OpenAIServingChat:
err_msg = f"Only master node can accept completion request, please send request to master node: {self.pod_ips[0]}"
api_server_logger.error(err_msg)
return ErrorResponse(message=err_msg, code=400)
if request.user is not None:
request_id = f"chatcmpl-{request.user}-{uuid.uuid4()}"
else:
request_id = f"chatcmpl-{uuid.uuid4()}"
api_server_logger.info(f"create chat completion request: {request_id}")
try:
current_req_dict = request.to_dict_for_infer(request_id)
current_req_dict["arrival_time"] = time.time()
prompt_token_ids = self.engine_client.format_and_add_data(current_req_dict)
if isinstance(prompt_token_ids, np.ndarray):
prompt_token_ids = prompt_token_ids.tolist()
except Exception as e:
return ErrorResponse(code=400, message=str(e))
if self.max_waiting_time < 0:
await self.engine_client.semaphore.acquire()
else:
await asyncio.wait_for(self.engine_client.semaphore.acquire(), timeout=self.max_waiting_time)
api_server_logger.debug(f"current waiting request {self.engine_client.semaphore.status()}")
del current_req_dict
if request.stream:
return self.chat_completion_stream_generator(request, request_id, request.model, prompt_token_ids)
else:
if request.user is not None:
request_id = f"chatcmpl-{request.user}-{uuid.uuid4()}"
else:
request_id = f"chatcmpl-{uuid.uuid4()}"
api_server_logger.info(f"create chat completion request: {request_id}")
text_after_process = None
try:
return await self.chat_completion_full_generator(request, request_id, request.model, prompt_token_ids)
current_req_dict = request.to_dict_for_infer(request_id)
current_req_dict["arrival_time"] = time.time()
prompt_token_ids = self.engine_client.format_and_add_data(current_req_dict)
text_after_process = current_req_dict.get("text_after_process")
if isinstance(prompt_token_ids, np.ndarray):
prompt_token_ids = prompt_token_ids.tolist()
except Exception as e:
return ErrorResponse(code=400, message=str(e))
del current_req_dict
if request.stream:
return self.chat_completion_stream_generator(
request, request_id, request.model, prompt_token_ids, text_after_process
)
else:
try:
return await self.chat_completion_full_generator(
request, request_id, request.model, prompt_token_ids, text_after_process
)
except Exception as e:
return ErrorResponse(code=400, message=str(e))
except Exception:
return ErrorResponse(code=408, message=f"Request queueing time exceeded {self.max_waiting_time}")
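This queueing step is distinct from the HTTP connection limiter: a negative max_waiting_time means wait indefinitely for an engine slot, while a non-negative value bounds the wait and converts a timeout into the 408 error above. A compact sketch of that branch, with a plain asyncio.Semaphore standing in for engine_client.semaphore:

import asyncio

async def acquire_engine_slot(sem: asyncio.Semaphore, max_waiting_time: float) -> str:
    # Negative max_waiting_time: block until a slot frees up (no deadline).
    # Otherwise: wait at most max_waiting_time seconds, then report a timeout
    # (the server wraps this into an ErrorResponse with code 408).
    try:
        if max_waiting_time < 0:
            await sem.acquire()
        else:
            await asyncio.wait_for(sem.acquire(), timeout=max_waiting_time)
        return "acquired"
    except asyncio.TimeoutError:
        return f"queued longer than {max_waiting_time}s -> 408"

async def main() -> None:
    sem = asyncio.Semaphore(1)
    await sem.acquire()                           # engine already busy
    print(await acquire_engine_slot(sem, 0.01))   # bounded wait -> 408
    sem.release()
    print(await acquire_engine_slot(sem, 0.01))   # slot free -> acquired

asyncio.run(main())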
def _create_streaming_error_response(self, message: str) -> str:
error_response = ErrorResponse(
code=400,
@@ -114,6 +130,7 @@ class OpenAIServingChat:
request_id: str,
model_name: str,
prompt_token_ids: list(),
text_after_process: str,
):
"""
Streaming chat completion generator.
@@ -124,11 +141,17 @@ class OpenAIServingChat:
previous_num_tokens = 0
num_prompt_tokens = 0
num_choices = 1
max_streaming_response_tokens = 1
enable_thinking = None
include_stop_str_in_output = False
if request.metadata is not None and request.metadata.get("max_streaming_response_tokens", 1) > 1:
max_streaming_response_tokens = request.metadata["max_streaming_response_tokens"]
max_streaming_response_tokens = (
request.max_streaming_response_tokens
if request.max_streaming_response_tokens is not None
else (request.metadata or {}).get("max_streaming_response_tokens", 1)
) # directly passed, or passed via metadata
enable_thinking = request.chat_template_kwargs.get("enable_thinking") if request.chat_template_kwargs else None
if enable_thinking is None:
enable_thinking = request.metadata.get("enable_thinking") if request.metadata else None
include_stop_str_in_output = request.include_stop_str_in_output
stream_options = request.stream_options
if stream_options is None:
@@ -149,12 +172,6 @@ class OpenAIServingChat:
dealer.write([b"", request_id.encode("utf-8")])
choices = []
current_waiting_time = 0
if request.metadata is not None:
enable_thinking = request.metadata.get("enable_thinking")
include_stop_str_in_output = request.metadata.get("include_stop_str_in_output", False)
enable_return_token_ids = request.return_token_ids or (
request.extra_body is not None and request.extra_body.get("return_token_ids", False)
)
while num_choices > 0:
try:
raw_data = await asyncio.wait_for(dealer.read(), timeout=10)
@@ -204,8 +221,9 @@ class OpenAIServingChat:
completion_token_ids=None,
),
)
if enable_return_token_ids:
if request.return_token_ids:
choice.delta.prompt_token_ids = list(prompt_token_ids)
choice.delta.text_after_process = text_after_process
chunk = ChatCompletionStreamResponse(
id=request_id,
object=chunk_object_type,
@@ -221,22 +239,16 @@ class OpenAIServingChat:
prompt_tokens_details=PromptTokenUsageInfo(cached_tokens=num_cached_tokens),
)
yield f"data: {chunk.model_dump_json(exclude_unset=True)} \n\n"
api_server_logger.info(f"Chat Streaming response send_idx 0: {chunk.model_dump_json()}")
first_iteration = False
output = res["outputs"]
delta_text = output["text"]
raw_top_logprobs = output["top_logprobs"]
logprobs_res = None
if raw_top_logprobs is not None:
top_logprobs = LogprobsLists(
logprob_token_ids=raw_top_logprobs[0],
logprobs=raw_top_logprobs[1],
sampled_token_ranks=raw_top_logprobs[2],
)
logprobs_res = self.build_logprobs_response(
request_logprobs=request.logprobs,
response_logprobs=top_logprobs,
request_top_logprobs=request.top_logprobs,
output_top_logprobs = output["top_logprobs"]
logprobs_res: Optional[LogProbs] = None
if request.logprobs and output_top_logprobs is not None:
logprobs_res = self._create_chat_logprobs(
output_top_logprobs, request.logprobs, request.top_logprobs
)
previous_num_tokens += len(output["token_ids"])
@@ -254,6 +266,7 @@ class OpenAIServingChat:
logprobs=logprobs_res,
arrival_time=arrival_time,
)
if res["finished"]:
num_choices -= 1
work_process_metrics.e2e_request_latency.observe(
@@ -274,8 +287,9 @@ class OpenAIServingChat:
if res.get("error_msg") is not None and "Recover" in res["error_msg"]:
choice.finish_reason = "recover_stop"
if enable_return_token_ids:
if request.return_token_ids:
choice.delta.completion_token_ids = list(output["token_ids"])
choice.delta.raw_prediction = output.get("raw_prediction")
if include_continuous_usage:
chunk.usage = UsageInfo(
prompt_tokens=num_prompt_tokens,
@@ -287,6 +301,9 @@ class OpenAIServingChat:
if len(choices) == max_streaming_response_tokens or res["finished"]:
chunk.choices = choices
yield f"data: {chunk.model_dump_json(exclude_unset=True)}\n\n"
# log the final chunk
if res["finished"]:
api_server_logger.info(f"Chat Streaming response last send: {chunk.model_dump_json()}")
choices = []
if choices:
@@ -316,6 +333,8 @@ class OpenAIServingChat:
yield f"data: {error_data}\n\n"
finally:
dealer.close()
self.engine_client.semaphore.release()
api_server_logger.info(f"release {self.engine_client.semaphore.status()}")
yield "data: [DONE]\n\n"
async def chat_completion_full_generator(
@@ -324,17 +343,19 @@ class OpenAIServingChat:
request_id: str,
model_name: str,
prompt_token_ids: list(),
text_after_process: str,
):
"""
Full chat completion generator.
"""
created_time = int(time.time())
final_res = None
enable_thinking = None
include_stop_str_in_output = False
enable_return_token_ids = request.return_token_ids or (
request.extra_body is not None and request.extra_body.get("return_token_ids", False)
)
enable_thinking = request.chat_template_kwargs.get("enable_thinking") if request.chat_template_kwargs else None
if enable_thinking is None:
enable_thinking = request.metadata.get("enable_thinking") if request.metadata else None
include_stop_str_in_output = request.include_stop_str_in_output
try:
dealer = await aiozmq.create_zmq_stream(zmq.DEALER, connect=f"ipc:///dev/shm/router_{self.pid}.ipc")
dealer.write([b"", request_id.encode("utf-8")])
@@ -363,9 +384,6 @@ class OpenAIServingChat:
for data in response:
if data.get("error_code", 200) != 200:
raise ValueError("{}".format(data["error_msg"]))
if request.metadata is not None:
enable_thinking = request.metadata.get("enable_thinking")
include_stop_str_in_output = request.metadata.get("include_stop_str_in_output", False)
data = self.engine_client.data_processor.process_response_dict(
data,
stream=False,
@@ -377,17 +395,10 @@ class OpenAIServingChat:
completion_token_ids.extend(data["outputs"]["token_ids"])
# Logprob handling for the response
output = data["outputs"]
raw_top_logprobs = output["top_logprobs"]
if raw_top_logprobs is not None:
top_logprobs = LogprobsLists(
logprob_token_ids=raw_top_logprobs[0],
logprobs=raw_top_logprobs[1],
sampled_token_ranks=raw_top_logprobs[2],
)
logprobs_res = self.build_logprobs_response(
request_logprobs=request.logprobs,
response_logprobs=top_logprobs,
request_top_logprobs=request.top_logprobs,
output_top_logprobs = output["top_logprobs"]
if output_top_logprobs is not None:
logprobs_res = self._create_chat_logprobs(
output_top_logprobs, request.logprobs, request.top_logprobs
)
if logprobs_res and logprobs_res.content is not None:
logprob_contents.extend(logprobs_res.content)
@@ -399,6 +410,8 @@ class OpenAIServingChat:
break
finally:
dealer.close()
self.engine_client.semaphore.release()
api_server_logger.info(f"release {self.engine_client.semaphore.status()}")
choices = []
output = final_res["outputs"]
@@ -407,8 +420,10 @@ class OpenAIServingChat:
content=output["text"],
reasoning_content=output.get("reasoning_content"),
tool_calls=output.get("tool_call_content"),
prompt_token_ids=prompt_token_ids if enable_return_token_ids else None,
completion_token_ids=(completion_token_ids if enable_return_token_ids else None),
prompt_token_ids=prompt_token_ids if request.return_token_ids else None,
completion_token_ids=completion_token_ids if request.return_token_ids else None,
text_after_process=text_after_process if request.return_token_ids else None,
raw_prediction=output.get("raw_prediction") if request.return_token_ids else None,
)
logprobs_full_res = None
if logprob_contents:
@@ -442,15 +457,46 @@ class OpenAIServingChat:
prompt_tokens_details=PromptTokenUsageInfo(cached_tokens=final_res.get("num_cached_tokens", 0)),
)
work_process_metrics.e2e_request_latency.observe(time.time() - final_res["metrics"]["request_start_time"])
return ChatCompletionResponse(
res = ChatCompletionResponse(
id=request_id,
created=created_time,
model=model_name,
choices=choices,
usage=usage,
)
api_server_logger.info(f"Chat response: {res.model_dump_json()}")
return res
def build_logprobs_response(
def _create_chat_logprobs(
self,
output_top_logprobs,
request_logprobs: Optional[bool] = None,
request_top_logprobs: Optional[int] = None,
) -> Optional[LogProbs]:
"""Create OpenAI-style logprobs for chat completions."""
if output_top_logprobs is None or len(output_top_logprobs) < 3 or any(not lst for lst in output_top_logprobs):
return None
logprobs_res: Optional[LogProbs] = None
for logprob_token_ids, logprobs, sampled_token_ranks in zip(
output_top_logprobs[0], output_top_logprobs[1], output_top_logprobs[2]
):
top_logprobs = LogprobsLists(
logprob_token_ids=[logprob_token_ids],
logprobs=[logprobs],
sampled_token_ranks=[sampled_token_ranks],
)
step_logprobs_res = self._build_logprobs_response(
request_logprobs=request_logprobs,
response_logprobs=top_logprobs,
request_top_logprobs=request_top_logprobs,
)
if logprobs_res is None:
logprobs_res = step_logprobs_res
else:
logprobs_res.content.extend(step_logprobs_res.content)
return logprobs_res
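_create_chat_logprobs walks three parallel per-step lists (candidate token ids, their logprobs, and the sampled token's rank) and folds each step into one content entry. A dependency-free sketch of that grouping, using plain lists and dicts in place of LogprobsLists and LogProbs (treating the first candidate as the sampled token is an assumption for illustration):

# output_top_logprobs as three parallel lists, one element per generated token:
# candidate token ids, candidate logprobs, and the sampled token's rank.
output_top_logprobs = [
    [[101, 7592], [102, 2088]],       # token ids per step
    [[-0.1, -2.3], [-0.4, -1.7]],     # matching logprobs per step
    [0, 0],                           # rank of the sampled token at each step
]

content = []
for token_ids, logprobs, sampled_rank in zip(*output_top_logprobs):
    # One "step" = one generated token plus its top alternatives.
    content.append({
        "token_id": token_ids[0],
        "logprob": logprobs[0],
        "top": dict(zip(token_ids, logprobs)),
        "sampled_rank": sampled_rank,
    })

print(content)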
def _build_logprobs_response(
self,
request_logprobs: bool,
response_logprobs: Optional[LogprobsLists],
@@ -487,12 +533,10 @@ class OpenAIServingChat:
token_str = self.engine_client.data_processor.process_logprob_response(
[tid], clean_up_tokenization_spaces=False
)
# token_bytes = token_str.encode("utf-8", errors="replace")
entry = LogProbEntry(
token=token_str,
logprob=lp,
# bytes=list(token_bytes)
)
token_bytes = token_str.encode("utf-8", errors="replace")
if "\ufffd" in token_str:
token_str = "bytes:" + "".join(f"\\x{byte:02x}" for byte in token_bytes)
entry = LogProbEntry(token=token_str, logprob=lp, bytes=list(token_bytes))
top_logprob_entries.append(entry)
# Construct the sampled token object (avoid sharing references with top_logprob_entries)
sampled_entry = LogProbEntry(
@@ -505,6 +549,6 @@ class OpenAIServingChat:
return LogProbs(content=[sampled_entry])
except Exception as e:
api_server_logger.error("Error in build_logprobs_response: %s", e)
api_server_logger.error("Error in _build_logprobs_response: %s", e)
api_server_logger.error(traceback.format_exc())
return None
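The reworked entry construction also handles tokens that do not decode to valid UTF-8: when the detokenized string contains U+FFFD, it is rewritten in an explicit bytes:\x.. form and the raw bytes are attached. A quick standalone illustration of that fallback:

# A detokenized fragment whose original bytes were not valid UTF-8, so the
# tokenizer produced the replacement character U+FFFD.
token_str = "\ufffd\ufffd"
token_bytes = token_str.encode("utf-8", errors="replace")

if "\ufffd" in token_str:
    token_str = "bytes:" + "".join(f"\\x{byte:02x}" for byte in token_bytes)

print(token_str)          # bytes:\xef\xbf\xbd\xef\xbf\xbd
print(list(token_bytes))  # [239, 191, 189, 239, 191, 189]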

Some files were not shown because too many files have changed in this diff.