Compare commits

23 Commits

Author SHA1 Message Date
Jiang-Jia-Jun
e421d51001 [Feature] Support include_stop_str_in_output (#2919)
Co-authored-by: Jiang-Jia-Jun <jiangjiajun@baidu.com>
2025-07-18 19:43:19 +08:00
sg263
c71d955e9c [Trace]fix opentelemetry can not work in uvicorn (#2907)
* add opentelemetry

* add opentelemetry

* add opentelemetry on dequeue

* add opentelemetry on dequeue

* add opentelemetry on dequeue

* fix opentelemetry-instrumentation-fastapi

* fix annotation

* fix opentelemetry-bootstrap

* fix opentelemetry-bootstrap

* fix opentelemetry can not work in uvicorn

* remove unless import

* move conf to env

* fix useless commit

---------

Co-authored-by: shige <shige@baidu.com>
2025-07-17 23:16:29 +08:00
gaoziyuan
2d2468ae72 fix config get (#2883) 2025-07-17 15:03:26 +08:00
sg263
7deac64233 [Bug Fix] fix opentelemetry-bootstra (#2875)
* add opentelemetry

* add opentelemetry

* add opentelemetry on dequeue

* add opentelemetry on dequeue

* add opentelemetry on dequeue

* fix opentelemetry-instrumentation-fastapi

* fix annotation

* fix opentelemetry-bootstrap

* fix opentelemetry-bootstrap

---------

Co-authored-by: shige <shige@baidu.com>
2025-07-17 00:51:02 +08:00
sg263
5a5f17cf97 fix put opentelemetry-instrumentation-fastapi in requierment (#2874)
* add opentelemetry

* add opentelemetry

* add opentelemetry on dequeue

* add opentelemetry on dequeue

* add opentelemetry on dequeue

* fix opentelemetry-instrumentation-fastapi

* fix annotation

---------

Co-authored-by: shige <shige@baidu.com>
2025-07-17 00:41:53 +08:00
sg263
0d61c65de1 [Trace] Support trace log (#2864)
* add opentelemetry

* add opentelemetry

* add opentelemetry on dequeue

* add opentelemetry on dequeue

* add opentelemetry on dequeue
2025-07-16 15:35:44 +08:00
Jiang-Jia-Jun
e5de28bff2 Update setup.py 2025-07-15 10:11:26 +08:00
AIbin
b9eede57b6 cp PR#2820 to release/2.0.2 (#2839) 2025-07-14 17:05:56 +08:00
lddfym
94e1a895e3 fix spelling error (#2826)
* fix spelling error

* fix scheduler reset error
2025-07-14 13:13:08 +08:00
zhenwenDang
87203ec87b After enabling "top_logprobs supports passing 0 and fix max_completion_tokens", an incorrect finish_reason was returned. (#2815)
* /v1/chat/completions endpoint now supports max_completion_tokens and fixes the return value of finish_reason

* top_logprobs supports passing 0
2025-07-11 16:53:12 +08:00
Sunny-bot1
4596dd7248 [FIX 2.0.2]fix topp topk default value (#2810)
* fix topp topk default value

* update topk
2025-07-11 16:12:02 +08:00
lddfym
ec986642df Global scheduler supports configuring hot updates (#2812) 2025-07-11 13:39:30 +08:00
chen
94691bcd90 fix enable_logprob not in rl_config (#2808) 2025-07-11 11:52:48 +08:00
Sunny-bot1
4025ea7e5b [FIX 2.0.2] Topk topp sampling fix (#2805)
* fix topk-topp

* fix
2025-07-10 06:15:03 -07:00
lizexu123
e681e1e719 [BugFix] fix RMSNorm rms_norm_esp (#2804) 2025-07-10 05:39:02 -07:00
chen
823a47e64a [Feature] Support return logprob of generated tokens (#2784)
* online chat support logprobs

* check xpu

* check vl_gpu_model_runner

* only cuda support logprob

* get_worker() check platform

---------

Co-authored-by: Jiang-Jia-Jun <163579578+Jiang-Jia-Jun@users.noreply.github.com>
2025-07-10 15:47:42 +08:00
gaoziyuan
39d2a1de46 fix num_blocks_local when small size model in TP2 running mode (#2793) 2025-07-10 13:44:56 +08:00
Sunny-bot1
1107e08cd9 [Feature 2.0.2] support top_k_top_p sampling (#2789)
* support top_k_top_p sampling

* fix

* add api param

* add api para

* fix

* fix

* fix

* fix

* fix

* fix

* fix

* fix

* change func name
2025-07-09 21:01:51 -07:00
Jiang-Jia-Jun
1fe37cb7e8 [BugFix] Fix vocab size error for ernie model 2025-07-09 22:33:04 +08:00
gaoziyuan
337d76f094 [sync fix] (#2759)
* add rl qwen model support

* fix

* fix

* add_commit_config

* fix
2025-07-08 19:29:23 +08:00
gaoziyuan
ae2f78184d 【Sync develop】 add commit info (#2755)
* add rl qwen model support

* fix

* fix

* add_commit_config
2025-07-08 17:02:50 +08:00
gaoziyuan
6851489425 【Sync】Release/2.0.1 (#2745)
* add rl qwen model support

* fix

* fix
2025-07-08 14:38:18 +08:00
Jiang-Jia-Jun
ea787d8f62 fix bug. (#2718) (#2720)
Co-authored-by: Ting <wtmlon@foxmail.com>
2025-07-05 09:00:01 +08:00
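The commits above expose several new OpenAI-compatible request parameters: include_stop_str_in_output (#2919), top_logprobs and max_completion_tokens (#2815), logprobs of generated tokens (#2784), and top_k/top_p sampling (#2789). Below is a minimal sketch of exercising them against a locally deployed FastDeploy server; the host, port, and model name are assumptions, and the exact field behavior should be checked against the deployed version.

```python
# Hypothetical smoke test for the request parameters added in this release.
# The endpoint path comes from the commit log (/v1/chat/completions, #2815);
# host, port, and "model": "default" are assumptions for a local deployment.
import requests

payload = {
    "model": "default",
    "messages": [{"role": "user", "content": "Hello"}],
    "max_completion_tokens": 64,        # supported alongside the finish_reason fix (#2815)
    "logprobs": True,                   # return logprobs of generated tokens (#2784)
    "top_logprobs": 0,                  # 0 is now accepted (#2815)
    "top_p": 0.8,                       # top_k/top_p sampling (#2789, #2805, #2810)
    "top_k": 20,
    "include_stop_str_in_output": True, # new in #2919
}

resp = requests.post("http://localhost:8000/v1/chat/completions", json=payload, timeout=60)
resp.raise_for_status()
print(resp.json()["choices"][0])
```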
645 changed files with 20838 additions and 46710 deletions

View File

@@ -1,7 +0,0 @@
[flake8]
ignore = E203, E402, E501, E731, E741, W503, W605, E722, E231, W604, E702, E226, E221, E713, E271
max-line-length = 119
# E402: module level import not at top of file
per-file-ignores =
__init__.py:F401,F403,E402

View File

@@ -1,50 +0,0 @@
name: Codestyle-Check
on:
pull_request:
branches:
- develop
- 'release/*'
jobs:
pre-commit:
name: Pre Commit
if: ${{ github.repository_owner == 'PaddlePaddle' }}
runs-on: ubuntu-latest
env:
PR_ID: ${{ github.event.pull_request.number }}
BRANCH: ${{ github.event.pull_request.base.ref }}
steps:
- name: Cleanup
run: |
rm -rf * .[^.]*
- name: Checkout base repo
uses: actions/checkout@v4
with:
ref: ${{ github.event.pull_request.base.ref }}
fetch-depth: 1000
- name: Merge PR to test branch
run: |
git fetch origin pull/${PR_ID}/merge
git checkout -b test FETCH_HEAD
- name: Setup python3.10
uses: actions/setup-python@v5
with:
python-version: '3.10'
cache: 'pip'
- name: Install dependencies
run: |
pip install pre-commit==4.2.0 cpplint==1.6.0 clang-format==13.0.0
- name: Check pre-commit
env:
SKIP_CLANG_TIDY_CHECK: "ON"
run: |
set +e
bash -x tools/codestyle/pre_commit.sh;EXCODE=$?
exit $EXCODE

View File

@@ -1,173 +0,0 @@
name: FastDeploy Linux GPU Build Task
description: "FastDeploy packages build and upload"
on:
workflow_call:
inputs:
DOCKER_IMAGE:
description: "Build Images"
required: true
type: string
default: "ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/paddleqa:cuda126-py310"
FASTDEPLOY_ARCHIVE_URL:
description: "URL of the compressed FastDeploy code archive."
required: true
type: string
COMPILE_ARCH:
description: "Build GPU Archs"
required: true
type: string
default: "80,90"
WITH_NIGHTLY_BUILD:
description: "Enable nightly build mode (e.g. add date suffix to version)"
required: false
type: string
default: "ON"
FD_VERSION:
description: "FastDeploy Package Version"
required: false
type: string
default: ""
UPLOAD:
description: "Upload Package"
required: false
type: string
default: "ON"
CACHE_DIR:
description: "Cache Dir Use"
required: false
type: string
default: ""
outputs:
wheel_path:
description: "Output path of the generated wheel"
value: ${{ jobs.fd-build.outputs.wheel_path }}
jobs:
fd-build:
runs-on: [self-hosted, GPU-Build]
outputs:
wheel_path: ${{ steps.set_output.outputs.wheel_path }}
steps:
- name: Code Prepare
shell: bash
env:
docker_image: ${{ inputs.DOCKER_IMAGE }}
fd_archive_url: ${{ inputs.FASTDEPLOY_ARCHIVE_URL }}
IS_PR: ${{ github.event_name == 'pull_request' }}
run: |
set -x
REPO="https://github.com/${{ github.repository }}.git"
FULL_REPO="${{ github.repository }}"
REPO_NAME="${FULL_REPO##*/}"
BASE_BRANCH="${{ github.base_ref }}"
# Clean the repository directory before starting
docker run --rm --net=host -v $(pwd):/workspace -w /workspace \
-e "REPO_NAME=${REPO_NAME}" \
${docker_image} /bin/bash -c '
if [ -d ${REPO_NAME} ]; then
echo "Directory ${REPO_NAME} exists, removing it..."
rm -rf ${REPO_NAME}*
fi
'
wget -q ${fd_archive_url}
tar -xf FastDeploy.tar.gz
rm -rf FastDeploy.tar.gz
cd FastDeploy
git config --global user.name "FastDeployCI"
git config --global user.email "fastdeploy_ci@example.com"
git log -n 3 --oneline
- name: FastDeploy Build
shell: bash
env:
docker_image: ${{ inputs.DOCKER_IMAGE }}
compile_arch: ${{ inputs.COMPILE_ARCH }}
fd_version: ${{ inputs.FD_VERSION }}
CACHE_DIR: ${{ inputs.CACHE_DIR }}
run: |
set -x
runner_name="${{ runner.name }}"
CARD_ID=$(echo "${runner_name}" | awk -F'-' '{print $NF}')
gpu_id=$(echo "$CARD_ID" | fold -w1 | paste -sd,)
CACHE_DIR="${CACHE_DIR:-$(dirname "$(dirname "${{ github.workspace }}")")}"
echo "CACHE_DIR is set to ${CACHE_DIR}"
if [ ! -f "${CACHE_DIR}/gitconfig" ]; then
touch "${CACHE_DIR}/gitconfig"
fi
PARENT_DIR=$(dirname "$WORKSPACE")
echo "PARENT_DIR:$PARENT_DIR"
docker run --rm --net=host \
--cap-add=SYS_PTRACE --privileged --shm-size=64G \
-v $(pwd):/workspace -w /workspace \
-v "${CACHE_DIR}/gitconfig:/etc/gitconfig:ro" \
-v "${CACHE_DIR}/.cache:/root/.cache" \
-v "${CACHE_DIR}/.ccache:/root/.ccache" \
-v "${CACHE_DIR}/ConfigDir:/root/.config" \
-e TZ="Asia/Shanghai" \
-e "COMPILE_ARCH=${compile_arch}" \
-e "FD_VERSION=${fd_version}" \
-e "WITH_NIGHTLY_BUILD=${WITH_NIGHTLY_BUILD}" \
--gpus "\"device=${gpu_id}\"" ${docker_image} /bin/bash -c '
if [[ -n "${FD_VERSION}" ]]; then
export FASTDEPLOY_VERSION=${FD_VERSION}
echo "Custom FastDeploy version: ${FASTDEPLOY_VERSION}"
fi
git config --global --add safe.directory /workspace/FastDeploy
cd FastDeploy
if [[ "${WITH_NIGHTLY_BUILD}" == "ON" ]];then
GIT_COMMIT_TIME=$(git --no-pager show -s --format=%ci HEAD)
DATE_ONLY=$(echo $GIT_COMMIT_TIME | sed "s/ .*//;s/-//g")
echo "Git Commit Time: $GIT_COMMIT_TIME"
echo "Date Only: $DATE_ONLY"
export FASTDEPLOY_VERSION="${FASTDEPLOY_VERSION}.dev${DATE_ONLY}"
fi
python -m pip install --pre paddlepaddle-gpu -i https://www.paddlepaddle.org.cn/packages/nightly/cu126/
pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
python -m pip install --upgrade pip
python -m pip install -r requirements.txt
python -m pip install wheel
# Build RDMA support
export ENABLE_FD_RDMA=1
bash build.sh 1 python false [${COMPILE_ARCH}]
ls ./dist/*.whl
'
- name: Package Upload
id: set_output
env:
compile_arch: ${{ inputs.COMPILE_ARCH }}
run: |
set -x
if [[ "${{ github.event_name }}" == "pull_request" ]];then
commit_id=${{ github.event.pull_request.head.sha }}
pr_num=${{ github.event.pull_request.number }}
target_path=paddle-github-action/PR/FastDeploy/${pr_num}/${commit_id}/SM${compile_arch//,/_}
elif [[ "${{ github.ref_type }}" == "tag" ]]; then
commit_id=${{ github.sha }}
tag_name=${{ github.ref_name }}
target_path=paddle-github-action/TAG/FastDeploy/${tag_name}/${commit_id}/SM${compile_arch//,/_}
else
commit_id=${{ github.sha }}
branch_name=${{ github.ref_name }}
target_path=paddle-github-action/BRANCH/FastDeploy/${branch_name}/${commit_id}/SM${compile_arch//,/_}
fi
wget -q --no-proxy --no-check-certificate https://paddle-qa.bj.bcebos.com/CodeSync/develop/PaddlePaddle/PaddleTest/tools/bos_tools.py
push_file=$(realpath bos_tools.py)
python --version
python -m pip install bce-python-sdk==0.9.29
cd FastDeploy/dist/
matches=($(ls fastdeploy*.whl))
if [ ${#matches[@]} -ne 1 ]; then
echo "Error: Found ${#matches[@]} matching files, expected exactly 1"
exit 1
fi
fd_wheel_name=${matches[0]}
echo "Found: $fd_wheel_name"
tree -L 3
python ${push_file} fastdeploy*.whl ${target_path}
target_path_stripped="${target_path#paddle-github-action/}"
WHEEL_PATH=https://paddle-github-action.bj.bcebos.com/${target_path_stripped}/${fd_wheel_name}
echo "wheel_path=${WHEEL_PATH}" >> $GITHUB_OUTPUT

View File

@@ -1,78 +0,0 @@
name: FastDeploy Code Clone
description: "FastDeploy clone and upload"
on:
workflow_call:
inputs:
bos_dir:
type: string
required: false
default: 'FastDeploy'
outputs:
repo_archive_url:
description: "Compressed source code archive."
value: ${{ jobs.code-clone.outputs.repo_archive_url }}
jobs:
code-clone:
runs-on:
group: HK-Clone
outputs:
repo_archive_url: ${{ steps.set_output.outputs.repo_archive_url }}
steps:
- name: Clone FastDeploy
uses: actions/checkout@v4
with:
ref: ${{ github.event_name == 'pull_request'
&& github.event.pull_request.base.ref
|| github.ref_name }}
submodules: 'recursive'
fetch-depth: 1000
- name: Merge PR (if needed)
if: ${{ github.event_name == 'pull_request' }}
run: |
git config --global user.name "FastDeployCI"
git config --global user.email "fastdeploy_ci@example.com"
echo "Fetching and merging PR..."
git fetch origin pull/${{ github.event.pull_request.number }}/head:pr/${{ github.event.pull_request.number }}
git merge --no-ff pr/${{ github.event.pull_request.number }}
echo "PR Branch log "
git log --oneline -n 5 pr/${{ github.event.pull_request.number }}
- uses: actions/setup-python@v5
with:
python-version: '3.10'
- name: Code Info Show and Upload
id: set_output
env:
AK: paddle
SK: paddle
run: |
git config --unset http.https://github.com/.extraheader
git submodule foreach --recursive sh -c "git config --local --unset-all 'http.https://github.com/.extraheader'"
git submodule foreach --recursive sh -c "git config remote.origin.fetch '+refs/heads/*:refs/remotes/origin/*'"
echo "Current HEAD Log:"
git log --oneline -n 5
ls
cd ..
tar -zcf FastDeploy.tar.gz FastDeploy
if [[ "${{ github.event_name }}" == "pull_request" ]];then
commit_id=${{ github.event.pull_request.head.sha }}
pr_num=${{ github.event.pull_request.number }}
target_path=paddle-github-action/PR/FastDeploy/${pr_num}/${commit_id}
elif [[ "${{ github.ref_type }}" == "tag" ]]; then
commit_id=${{ github.sha }}
tag_name=${{ github.ref_name }}
target_path=paddle-github-action/TAG/FastDeploy/${tag_name}/${commit_id}
else
commit_id=${{ github.sha }}
branch_name=${{ github.ref_name }}
target_path=paddle-github-action/BRANCH/FastDeploy/${branch_name}/${commit_id}
fi
wget -q --no-proxy --no-check-certificate https://paddle-qa.bj.bcebos.com/CodeSync/develop/PaddlePaddle/PaddleTest/tools/bos_tools.py
push_file=$(realpath bos_tools.py)
python -m pip install bce-python-sdk==0.9.29
ls
python ${push_file} FastDeploy.tar.gz ${target_path}
target_path_stripped="${target_path#paddle-github-action/}"
REPO_ARCHIVE_URL=https://paddle-github-action.bj.bcebos.com/${target_path_stripped}/FastDeploy.tar.gz
echo "repo_archive_url=${REPO_ARCHIVE_URL}" >> $GITHUB_OUTPUT

View File

@@ -1,169 +0,0 @@
name: Run FastDeploy LogProb Tests
description: "Run FastDeploy LogProb Tests"
on:
workflow_call:
inputs:
DOCKER_IMAGE:
description: "Build Images"
required: true
type: string
default: "ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/paddleqa:cuda126-py310"
PADDLETEST_ARCHIVE_URL:
description: "URL of the compressed FastDeploy code archive."
required: true
type: string
default: "https://xly-devops.bj.bcebos.com/PaddleTest/PaddleTest.tar.gz"
FASTDEPLOY_WHEEL_URL:
description: "URL of the FastDeploy Wheel."
required: true
type: string
CACHE_DIR:
description: "Cache Dir Use"
required: false
type: string
default: ""
MODEL_CACHE_DIR:
description: "Cache Dir Use"
required: false
type: string
default: ""
jobs:
run_tests_logprob:
runs-on: [self-hosted, GPU-h20-1Cards]
steps:
- name: Code Prepare
shell: bash
env:
docker_image: ${{ inputs.DOCKER_IMAGE }}
paddletest_archive_url: ${{ inputs.PADDLETEST_ARCHIVE_URL }}
run: |
# Clean the repository directory before starting
docker run --rm --net=host -v $(pwd):/workspace -w /workspace \
-e "REPO_NAME=${REPO_NAME}" \
-e "BASE_BRANCH=${BASE_BRANCH}" \
${docker_image} /bin/bash -c '
rm -rf /workspace/*
'
wget -q ${paddletest_archive_url}
tar -xf PaddleTest.tar.gz
rm -rf PaddleTest.tar.gz
cd PaddleTest
git config --global user.name "FastDeployCI"
git config --global user.email "fastdeploy_ci@example.com"
git log -n 3 --oneline
- name: logprob test
shell: bash
env:
docker_image: ${{ inputs.DOCKER_IMAGE }}
fastdeploy_wheel_url: ${{ inputs.FASTDEPLOY_WHEEL_URL }}
CACHE_DIR: ${{ inputs.CACHE_DIR }}
MODEL_CACHE_DIR: ${{ inputs.MODEL_CACHE_DIR }}
run: |
runner_name="${{ runner.name }}"
CARD_ID=$(echo "${runner_name}" | awk -F'-' '{print $NF}')
DEVICES=$(echo "$CARD_ID" | fold -w1 | paste -sd,)
DEVICE_PORT=$(echo "$DEVICES" | cut -d',' -f1)
FLASK_PORT=$((42068 + DEVICE_PORT * 100))
FD_API_PORT=$((42088 + DEVICE_PORT * 100))
FD_ENGINE_QUEUE_PORT=$((42058 + DEVICE_PORT * 100))
FD_METRICS_PORT=$((42078 + DEVICE_PORT * 100))
echo "Test ENV Parameter:"
echo "========================================================="
echo "FLASK_PORT=${FLASK_PORT}"
echo "FD_API_PORT=${FD_API_PORT}"
echo "FD_ENGINE_QUEUE_PORT=${FD_ENGINE_QUEUE_PORT}"
echo "FD_METRICS_PORT=${FD_METRICS_PORT}"
echo "DEVICES=${DEVICES}"
echo "========================================================="
CACHE_DIR="${CACHE_DIR:-$(dirname "$(dirname "${{ github.workspace }}")")}"
echo "CACHE_DIR is set to ${CACHE_DIR}"
if [ ! -f "${CACHE_DIR}/gitconfig" ]; then
touch "${CACHE_DIR}/gitconfig"
fi
if [ ! -d "${MODEL_CACHE_DIR}" ]; then
echo "Error: MODEL_CACHE_DIR '${MODEL_CACHE_DIR}' does not exist."
exit 1
fi
PORTS=($FLASK_PORT $FD_API_PORT $FD_ENGINE_QUEUE_PORT $FD_METRICS_PORT)
LOG_FILE="./port_cleanup_$(date +%Y%m%d_%H%M%S).log"
echo "==== LOG_FILE is ${LOG_FILE} ===="
echo "==== PORT CLEAN BEFORE TASK RUN ====" | tee -a $LOG_FILE
for port in "${PORTS[@]}"; do
PIDS=$(lsof -t -i :$port || true)
if [ -n "$PIDS" ]; then
echo "Port $port is occupied by PID(s): $PIDS" | tee -a $LOG_FILE
echo "$PIDS" | xargs -r kill -9
echo "Port $port cleared" | tee -a $LOG_FILE
else
echo "Port $port is free" | tee -a $LOG_FILE
fi
done
echo "==== PORT CLEAN COMPLETE ====" | tee -a $LOG_FILE
docker run --ipc=host --pid=host --net=host \
-v $(pwd):/workspace \
-w /workspace \
-e fastdeploy_wheel_url=${fastdeploy_wheel_url} \
-e "FD_API_PORT=${FD_API_PORT}" \
-e "FD_ENGINE_QUEUE_PORT=${FD_ENGINE_QUEUE_PORT}" \
-e "FD_METRICS_PORT=${FD_METRICS_PORT}" \
-e "FLASK_PORT=${FLASK_PORT}" \
-v "${MODEL_CACHE_DIR}:/MODELDATA" \
-v "${CACHE_DIR}/gitconfig:/etc/gitconfig:ro" \
-v "${CACHE_DIR}/.cache:/root/.cache" \
-v "${CACHE_DIR}/ConfigDir:/root/.config" \
-e TZ="Asia/Shanghai" \
--gpus '"device='"${DEVICES}"'"' ${docker_image} /bin/bash -xc '
python -m pip install --pre paddlepaddle-gpu -i https://www.paddlepaddle.org.cn/packages/nightly/cu126/
pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
python -m pip install ${fastdeploy_wheel_url}
wget https://paddle-qa.bj.bcebos.com/zhengtianyu/tools/llm-deploy-linux-amd64
chmod +x ./llm-deploy-linux-amd64
./llm-deploy-linux-amd64 -python python3.10 \
-model_name ERNIE-4.5-0.3B-Paddle \
-model_path /MODELDATA \
--skip install
cd PaddleTest/framework/ServeTest
python3.10 deploy.py > dd.log 2>&1 &
sleep 3
curl -X POST http://0.0.0.0:${FLASK_PORT}/start \
-H "Content-Type: application/json" \
-d "{\"--model\": \"/MODELDATA/ERNIE-4.5-0.3B-Paddle\"}"
curl -X POST http://localhost:${FLASK_PORT}/wait_for_infer?timeout=90
set +e
rm -rf ./baseline_output
cp -r baseline/ERNIE-4.5-0.3B-Paddle ./baseline_output
LOGPROB_EXIT_CODE=0
python3.10 lanucher.py --request_template TOKEN_LOGPROB --url http://localhost:${FD_API_PORT}/v1/chat/completions --case ./cases/demo.yaml --concurrency 1 --name demo --exe logprob || LOGPROB_EXIT_CODE=$?
echo "LOGPROB_EXIT_CODE=${LOGPROB_EXIT_CODE}" > /workspace/exit_code.env
curl -X POST http://localhost:${FLASK_PORT}/stop
sleep 10s
cat *result.log
exit 0
'
if [ $? -ne 0 ];then
exit 1
fi
if [ -f exit_code.env ]; then
cat exit_code.env >> $GITHUB_ENV
fi
- name: logprob test result
if: ${{ env.LOGPROB_EXIT_CODE != 0 }}
shell: bash
run: |
echo "logprob test failed with exit code ${{ env.LOGPROB_EXIT_CODE }}"
exit 8

View File

@@ -1,138 +0,0 @@
name: Pre-CE-Test
on:
workflow_call:
inputs:
DOCKER_IMAGE:
description: "Build Images"
required: true
type: string
default: "ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/paddle:fastdeploy-ciuse-cuda126"
FASTDEPLOY_ARCHIVE_URL:
description: "URL of the compressed FastDeploy code archive."
required: true
type: string
FASTDEPLOY_WHEEL_URL:
description: "URL of the FastDeploy Wheel."
required: true
type: string
CACHE_DIR:
description: "Cache Dir Use"
required: false
type: string
default: ""
MODEL_CACHE_DIR:
description: "Cache Dir Use"
required: false
type: string
default: ""
concurrency:
group: ${{ github.event.pull_request.number }}
cancel-in-progress: true
jobs:
run_ce_cases:
runs-on: [self-hosted, PRE_CE_RUN_2Card]
steps:
- name: Print current runner name
run: |
echo "Current runner name: ${{ runner.name }}"
- name: Code Prepare
shell: bash
env:
docker_image: ${{ inputs.DOCKER_IMAGE }}
fd_archive_url: ${{ inputs.FASTDEPLOY_ARCHIVE_URL }}
run: |
set -x
REPO="https://github.com/${{ github.repository }}.git"
FULL_REPO="${{ github.repository }}"
REPO_NAME="${FULL_REPO##*/}"
BASE_BRANCH="${{ github.base_ref }}"
# Clean the repository directory before starting
docker run --rm --net=host -v $(pwd):/workspace -w /workspace \
-e "REPO_NAME=${REPO_NAME}" \
${docker_image} /bin/bash -c '
if [ -d ${REPO_NAME} ]; then
echo "Directory ${REPO_NAME} exists, removing it..."
rm -rf ${REPO_NAME}*
fi
'
wget -q ${fd_archive_url}
tar -xf FastDeploy.tar.gz
rm -rf FastDeploy.tar.gz
cd FastDeploy
git config --global user.name "FastDeployCI"
git config --global user.email "fastdeploy_ci@example.com"
git log -n 3 --oneline
- name: Run CI unittest
env:
docker_image: ${{ inputs.DOCKER_IMAGE }}
fd_wheel_url: ${{ inputs.FASTDEPLOY_WHEEL_URL }}
CACHE_DIR: ${{ inputs.CACHE_DIR }}
MODEL_CACHE_DIR: ${{ inputs.MODEL_CACHE_DIR }}
run: |
runner_name="${{ runner.name }}"
CARD_ID=$(echo "${runner_name}" | awk -F'-' '{print $NF}')
DEVICES=$(echo "$CARD_ID" | fold -w1 | paste -sd,)
DEVICE_PORT=$(echo "$DEVICES" | cut -d',' -f1)
FLASK_PORT=$((42068 + DEVICE_PORT * 100))
FD_API_PORT=$((42088 + DEVICE_PORT * 100))
FD_ENGINE_QUEUE_PORT=$((42058 + DEVICE_PORT * 100))
FD_METRICS_PORT=$((42078 + DEVICE_PORT * 100))
echo "Test ENV Parameter:"
echo "========================================================="
echo "FLASK_PORT=${FLASK_PORT}"
echo "FD_API_PORT=${FD_API_PORT}"
echo "FD_ENGINE_QUEUE_PORT=${FD_ENGINE_QUEUE_PORT}"
echo "FD_METRICS_PORT=${FD_METRICS_PORT}"
echo "DEVICES=${DEVICES}"
echo "========================================================="
CACHE_DIR="${CACHE_DIR:-$(dirname "$(dirname "${{ github.workspace }}")")}"
echo "CACHE_DIR is set to ${CACHE_DIR}"
if [ ! -f "${CACHE_DIR}/gitconfig" ]; then
touch "${CACHE_DIR}/gitconfig"
fi
PORTS=($FLASK_PORT $FD_API_PORT $FD_ENGINE_QUEUE_PORT $FD_METRICS_PORT)
LOG_FILE="./port_cleanup_$(date +%Y%m%d_%H%M%S).log"
echo "==== LOG_FILE is ${LOG_FILE} ===="
echo "==== PORT CLEAN BEFORE TASK RUN ====" | tee -a $LOG_FILE
for port in "${PORTS[@]}"; do
PIDS=$(lsof -t -i :$port || true)
if [ -n "$PIDS" ]; then
echo "Port $port is occupied by PID(s): $PIDS" | tee -a $LOG_FILE
echo "$PIDS" | xargs -r kill -9
echo "Port $port cleared" | tee -a $LOG_FILE
else
echo "Port $port is free" | tee -a $LOG_FILE
fi
done
echo "==== PORT CLEAN COMPLETE ====" | tee -a $LOG_FILE
docker run --rm --net=host -v $(pwd):/workspace -w /workspace \
-v "${CACHE_DIR}/gitconfig:/etc/gitconfig:ro" \
-v "${CACHE_DIR}/.cache:/root/.cache" \
-v "${CACHE_DIR}/ConfigDir:/root/.config" \
-v "${MODEL_CACHE_DIR}:/ModelData:ro" \
-e "MODEL_PATH=/ModelData" \
-e "FD_API_PORT=${FD_API_PORT}" \
-e "FD_ENGINE_QUEUE_PORT=${FD_ENGINE_QUEUE_PORT}" \
-e "FD_METRICS_PORT=${FD_METRICS_PORT}" \
-e "FLASK_PORT=${FLASK_PORT}" \
-e "fd_wheel_url=${fd_wheel_url}" \
--gpus "\"device=${DEVICES}\"" ${docker_image} /bin/bash -c '
git config --global --add safe.directory /workspace/FastDeploy
cd FastDeploy
python -m pip install --pre paddlepaddle-gpu -i https://www.paddlepaddle.org.cn/packages/nightly/cu126/
python -m pip install ${fd_wheel_url}
bash scripts/run_pre_ce.sh
'

View File

@@ -1,274 +0,0 @@
name: Run FastDeploy Unit Tests and Coverage
description: "Run FastDeploy Unit Tests and Coverage"
on:
workflow_call:
inputs:
DOCKER_IMAGE:
description: "Build Images"
required: true
type: string
default: "ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/paddleqa:cuda126-py310"
FASTDEPLOY_ARCHIVE_URL:
description: "URL of the compressed FastDeploy code archive."
required: true
type: string
FASTDEPLOY_WHEEL_URL:
description: "URL of the FastDeploy Wheel."
required: true
type: string
CACHE_DIR:
description: "Cache Dir Use"
required: false
type: string
default: ""
MODEL_CACHE_DIR:
description: "Cache Dir Use"
required: false
type: string
default: ""
jobs:
run_tests_with_coverage:
runs-on: [self-hosted, GPU-h1z1-2Cards]
outputs:
diff_cov_file_url: ${{ steps.cov_upload.outputs.diff_cov_file_url }}
unittest_failed_url: ${{ steps.cov_upload.outputs.unittest_failed_url }}
diff_cov_result_json_url: ${{ steps.cov_upload.outputs.diff_cov_result_json_url }}
steps:
- name: Code Prepare
shell: bash
env:
docker_image: ${{ inputs.DOCKER_IMAGE }}
fd_archive_url: ${{ inputs.FASTDEPLOY_ARCHIVE_URL }}
run: |
set -x
REPO="https://github.com/${{ github.repository }}.git"
FULL_REPO="${{ github.repository }}"
REPO_NAME="${FULL_REPO##*/}"
BASE_BRANCH="${{ github.base_ref }}"
# Clean the repository directory before starting
docker run --rm --net=host -v $(pwd):/workspace -w /workspace \
-e "REPO_NAME=${REPO_NAME}" \
${docker_image} /bin/bash -c '
if [ -d ${REPO_NAME} ]; then
echo "Directory ${REPO_NAME} exists, removing it..."
rm -rf ${REPO_NAME}*
fi
'
wget -q ${fd_archive_url}
tar -xf FastDeploy.tar.gz
rm -rf FastDeploy.tar.gz
cd FastDeploy
git config --global user.name "FastDeployCI"
git config --global user.email "fastdeploy_ci@example.com"
git log -n 3 --oneline
- name: Run FastDeploy Unit Tests and Coverage
shell: bash
env:
docker_image: ${{ inputs.DOCKER_IMAGE }}
fd_wheel_url: ${{ inputs.FASTDEPLOY_WHEEL_URL }}
CACHE_DIR: ${{ inputs.CACHE_DIR }}
BASE_REF: ${{ github.event.pull_request.base.ref }}
MODEL_CACHE_DIR: ${{ inputs.MODEL_CACHE_DIR }}
run: |
set -x
runner_name="${{ runner.name }}"
CARD_ID=$(echo "${runner_name}" | awk -F'-' '{print $NF}')
DEVICES=$(echo "$CARD_ID" | fold -w1 | paste -sd,)
DEVICE_PORT=$(echo "$DEVICES" | cut -d',' -f1)
FLASK_PORT=$((42068 + DEVICE_PORT * 100))
FD_API_PORT=$((42088 + DEVICE_PORT * 100))
FD_ENGINE_QUEUE_PORT=$((42058 + DEVICE_PORT * 100))
FD_METRICS_PORT=$((42078 + DEVICE_PORT * 100))
echo "Test ENV Parameter:"
echo "========================================================="
echo "FLASK_PORT=${FLASK_PORT}"
echo "FD_API_PORT=${FD_API_PORT}"
echo "FD_ENGINE_QUEUE_PORT=${FD_ENGINE_QUEUE_PORT}"
echo "FD_METRICS_PORT=${FD_METRICS_PORT}"
echo "DEVICES=${DEVICES}"
echo "========================================================="
CACHE_DIR="${CACHE_DIR:-$(dirname "$(dirname "${{ github.workspace }}")")}"
echo "CACHE_DIR is set to ${CACHE_DIR}"
if [ ! -f "${CACHE_DIR}/gitconfig" ]; then
touch "${CACHE_DIR}/gitconfig"
fi
PORTS=($FLASK_PORT $FD_API_PORT $FD_ENGINE_QUEUE_PORT $FD_METRICS_PORT)
LOG_FILE="./port_cleanup_$(date +%Y%m%d_%H%M%S).log"
echo "==== LOG_FILE is ${LOG_FILE} ===="
echo "==== PORT CLEAN BEFORE TASK RUN ====" | tee -a $LOG_FILE
for port in "${PORTS[@]}"; do
PIDS=$(lsof -t -i :$port || true)
if [ -n "$PIDS" ]; then
echo "Port $port is occupied by PID(s): $PIDS" | tee -a $LOG_FILE
echo "$PIDS" | xargs -r kill -9
echo "Port $port cleared" | tee -a $LOG_FILE
else
echo "Port $port is free" | tee -a $LOG_FILE
fi
done
echo "==== PORT CLEAN COMPLETE ====" | tee -a $LOG_FILE
docker run --rm --net=host \
--cap-add=SYS_PTRACE --shm-size=64G \
-v $(pwd):/workspace -w /workspace \
-v "${CACHE_DIR}/gitconfig:/etc/gitconfig:ro" \
-v "${CACHE_DIR}/.cache:/root/.cache" \
-v "${CACHE_DIR}/ConfigDir:/root/.config" \
-v "${MODEL_CACHE_DIR}:/ModelData:ro" \
-e "MODEL_PATH=/ModelData" \
-e "FD_API_PORT=${FD_API_PORT}" \
-e "FD_ENGINE_QUEUE_PORT=${FD_ENGINE_QUEUE_PORT}" \
-e "FD_METRICS_PORT=${FD_METRICS_PORT}" \
-e "FLASK_PORT=${FLASK_PORT}" \
-e TZ="Asia/Shanghai" \
-e "fd_wheel_url=${fd_wheel_url}" \
-e "BASE_REF=${BASE_REF}" \
--gpus "\"device=${DEVICES}\"" ${docker_image} /bin/bash -c '
git config --global --add safe.directory /workspace/FastDeploy
cd FastDeploy
python -m pip install --pre paddlepaddle-gpu -i https://www.paddlepaddle.org.cn/packages/nightly/cu126/
pip config set global.extra-index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
python -m pip install coverage
python -m pip install diff-cover
python -m pip install ${fd_wheel_url}
if [ -d "test/plugins" ]; then
cd test/plugins
python setup.py install
cd ../..
else
echo "Warning: test/plugins directory not found, skipping setup.py install"
fi
export COVERAGE_FILE=/workspace/FastDeploy/coveragedata/.coverage
export COVERAGE_RCFILE=/workspace/FastDeploy/scripts/.coveragerc
TEST_EXIT_CODE=0
bash scripts/coverage_run.sh || TEST_EXIT_CODE=8
git diff origin/${BASE_REF}..HEAD --unified=0 > diff.txt
echo "TEST_EXIT_CODE=${TEST_EXIT_CODE}" >> exit_code.env
coverage combine coveragedata/
coverage xml -o python_coverage_all.xml
COVERAGE_EXIT_CODE=0
diff-cover python_coverage_all.xml --diff-file=diff.txt --fail-under=80 --json-report diff_coverage.json || COVERAGE_EXIT_CODE=9
echo "COVERAGE_EXIT_CODE=${COVERAGE_EXIT_CODE}" >> exit_code.env
python scripts/generate_diff_coverage_xml.py diff.txt python_coverage_all.xml
'
if [ -f FastDeploy/exit_code.env ]; then
cat FastDeploy/exit_code.env >> $GITHUB_ENV
fi
- name: Upload unit result and diff coverage to bos
id: cov_upload
shell: bash
run: |
cd FastDeploy
commit_id=${{ github.event.pull_request.head.sha }}
pr_num=${{ github.event.pull_request.number }}
target_path=paddle-github-action/PR/FastDeploy/${pr_num}/${commit_id}/SM${compile_arch//,/_}
wget -q --no-proxy --no-check-certificate https://paddle-qa.bj.bcebos.com/CodeSync/develop/PaddlePaddle/PaddleTest/tools/bos_tools.py
push_file=$(realpath bos_tools.py)
python -m pip install bce-python-sdk==0.9.29
diff_cov_file="diff_coverage.xml"
if [ -f ${diff_cov_file} ];then
python ${push_file} ${diff_cov_file} ${target_path}/CoverageData
target_path_stripped="${target_path#paddle-github-action/}"
DIFF_COV_FILE_URL=https://paddle-github-action.bj.bcebos.com/${target_path_stripped}/CoverageData/${diff_cov_file}
echo "diff_cov_file_url=${DIFF_COV_FILE_URL}" >> $GITHUB_OUTPUT
echo "diff_cov_file_url=${DIFF_COV_FILE_URL}" >> $GITHUB_ENV
fi
diff_cov_result_json="diff_coverage.json"
if [ -f ${diff_cov_result_json} ];then
python ${push_file} ${diff_cov_result_json} ${target_path}/CoverageData
target_path_stripped="${target_path#paddle-github-action/}"
DIFF_COV_JSON_URL=https://paddle-github-action.bj.bcebos.com/${target_path_stripped}/CoverageData/${diff_cov_result_json}
echo "diff_cov_result_json_url=${DIFF_COV_JSON_URL}" >> $GITHUB_OUTPUT
echo "diff_cov_result_json_url=${DIFF_COV_JSON_URL}" >> $GITHUB_ENV
fi
unittest_result="test/failed_tests.log"
if [ -s ${unittest_result} ];then
python ${push_file} ${unittest_result} ${target_path}/UnitTestResult
target_path_stripped="${target_path#paddle-github-action/}"
UNIT_TEST_RESULT_URL=https://paddle-github-action.bj.bcebos.com/${target_path_stripped}/UnitTestResult/${unittest_result}
echo "unittest_failed_url=${UNIT_TEST_RESULT_URL}" >> $GITHUB_OUTPUT
echo "unittest_failed_url=${UNIT_TEST_RESULT_URL}" >> $GITHUB_ENV
fi
- name: Check Unit Test Success
shell: bash
run: |
cd FastDeploy
if [ "$TEST_EXIT_CODE" -eq 8 ]; then
filename=$(basename "$unittest_failed_url")
if [ -z "${unittest_failed_url}" ]; then
echo "No diff unit failed file URL provided."
else
rm -rf "${filename}"
wget -O ${filename} ${unittest_failed_url} || echo "Download unittest file failed, but continuing..."
fi
echo "Unit tests failed (exit code 8)"
if [ -f "${filename}" ];then
echo "Failed test cases:"
cat "${filename}"
fi
exit "$TEST_EXIT_CODE"
fi
echo "All tests passed"
- name: Verify Code Coverage Threshold (80%)
shell: bash
run: |
cd FastDeploy
if [ "$COVERAGE_EXIT_CODE" -eq 9 ]; then
echo "Coverage generation failed (exit code 9)"
filename=$(basename "$diff_cov_result_json_url")
if [ -z "${diff_cov_result_json_url}" ]; then
echo "No diff cov result file URL provided."
else
rm -rf "${filename}"
wget -O ${filename} ${diff_cov_result_json_url} || echo "Download cov json file failed, but continuing..."
fi
if [ -f "${filename}" ];then
echo "Failed test cases:"
if command -v jq >/dev/null 2>&1; then
jq . "${filename}"
else
cat "${filename}"
fi
fi
exit "$COVERAGE_EXIT_CODE"
fi
echo "coverage passed"
exit 0
diff_coverage_report:
needs: run_tests_with_coverage
if: always()
runs-on: ubuntu-latest
steps:
- name: coverage diff file download
shell: bash
env:
diff_cov_file_url: ${{ needs.run_tests_with_coverage.outputs.diff_cov_file_url }}
run: |
if [ -z "${diff_cov_file_url}" ]; then
echo "No diff coverage file URL provided."
exit 0
fi
wget "${diff_cov_file_url}" -O ./diff_coverage.xml || echo "Download cov file failed, but continuing..."
- name: Upload diff coverage report
if: ${{ needs.run_tests_with_coverage.outputs.diff_cov_file_url != null && needs.run_tests_with_coverage.outputs.diff_cov_file_url != '' }}
uses: codecov/codecov-action@v5
with:
files: ./diff_coverage.xml
name: python diff coverage
verbose: true

View File

@@ -1,39 +0,0 @@
name: Approval
on:
pull_request:
branches:
- develop
- 'release/*'
jobs:
Approval:
name: Approval
if: ${{ github.repository_owner == 'PaddlePaddle' }}
runs-on: ubuntu-latest
env:
PR_ID: ${{ github.event.pull_request.number }}
BRANCH: ${{ github.event.pull_request.base.ref }}
steps:
- name: Checkout base repo
uses: actions/checkout@v4
with:
ref: ${{ github.event.pull_request.base.ref }}
fetch-depth: 1000
- name: Merge PR to test branch
run: |
git fetch origin pull/${PR_ID}/merge
git checkout -b test FETCH_HEAD
git log -n 3 --oneline
git remote add upstream https://github.com/PaddlePaddle/FastDeploy.git
git fetch upstream $BRANCH
- name: Setup python3.10
uses: actions/setup-python@v5
with:
python-version: '3.10'
- name: Run approval check script
run: |
bash scripts/check_approval.sh

View File

@@ -1,4 +1,4 @@
name: CI_ILUVATAR
name: CI
on:
pull_request:
@@ -6,12 +6,12 @@ on:
workflow_dispatch:
concurrency:
group: ${{ github.event.pull_request.number }}-iluvatar-ci
group: ${{ github.event.pull_request.number }}
cancel-in-progress: true
jobs:
CI_ILUVATAR:
runs-on: [self-hosted, IXUCA]
build:
runs-on: [self-hosted, GPU-L20-4Card]
steps:
- name: Print current runner name
run: |
@@ -22,7 +22,7 @@ jobs:
- name: Code Checkout
env:
docker_image: ccr-2vdh3abv-pub.cnc.bj.baidubce.com/device/paddle-ixuca:latest
docker_image: ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/paddle:fastdeploy-ciuse-cuda126
run: |
REPO="https://github.com/${{ github.repository }}.git"
FULL_REPO="${{ github.repository }}"
@@ -51,7 +51,7 @@ jobs:
- name: Run CI unittest
env:
docker_image: ccr-2vdh3abv-pub.cnc.bj.baidubce.com/device/paddle-ixuca:latest
docker_image: ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/paddle:fastdeploy-ciuse-cuda126
run: |
runner_name="${{ runner.name }}"
last_char="${runner_name: -1}"
@@ -59,7 +59,7 @@ jobs:
if [[ "$last_char" =~ [0-3] ]]; then
gpu_id="$last_char"
else
gpu_id="0"
gpu_id="0"
fi
FD_API_PORT=$((9180 + gpu_id * 100))
FD_ENGINE_QUEUE_PORT=$((9150 + gpu_id * 100))
@@ -67,18 +67,17 @@ jobs:
PARENT_DIR=$(dirname "$WORKSPACE")
echo "PARENT_DIR:$PARENT_DIR"
docker run --rm --net=host --pid=host --cap-add=ALL --privileged --shm-size=64G \
-v /usr/src:/usr/src -v /lib/modules:/lib/modules -v /dev:/dev \
-v $(pwd):/workspace -w /workspace \
-v "/data1/fastdeploy:/data1/fastdeploy" \
-e "MODEL_PATH=/ssd3/model" \
-e "http_proxy=$(git config --global --get http.proxy)" \
-e "https_proxy=$(git config --global --get https.proxy)" \
docker run --rm --net=host -v $(pwd):/workspace -w /workspace \
-v "/ssd4/GithubActions/gitconfig:/etc/gitconfig:ro" \
-v "/ssd4/GithubActions/ModelData:/ModelData:ro" \
-v "/ssd4/GithubActions/CacheDir:/root/.cache" \
-v "/ssd4/GithubActions/ConfigDir:/root/.config" \
-e "MODEL_PATH=/ModelData" \
-e "FD_API_PORT=${FD_API_PORT}" \
-e "FD_ENGINE_QUEUE_PORT=${FD_ENGINE_QUEUE_PORT}" \
-e "FD_METRICS_PORT=${FD_METRICS_PORT}" \
${docker_image} /bin/bash -c "
--gpus device=${gpu_id} ${docker_image} /bin/bash -c "
git config --global --add safe.directory /workspace/FastDeploy
cd FastDeploy
bash scripts/run_ci_iluvatar.sh
"
bash scripts/run_ci.sh
"

View File

@@ -1,89 +0,0 @@
name: CI_GCU
on:
pull_request:
branches:
- develop
- 'release/*'
workflow_dispatch:
concurrency:
group: ${{ github.event.pull_request.number }}-gcu-ci
cancel-in-progress: true
jobs:
CI_GCU:
runs-on: [self-hosted, GCU-S60-8Card]
steps:
- name: Print current runner name
run: |
echo "Current runner name: ${{ runner.name }}"
- name: Code Checkout
env:
docker_image: ccr-2vdh3abv-pub.cnc.bj.baidubce.com/device/paddle-gcu:topsrider3.5.102-ubuntu20-x86_64-gcc84
run: |
REPO="https://github.com/${{ github.repository }}.git"
FULL_REPO="${{ github.repository }}"
REPO_NAME="${FULL_REPO##*/}"
BASE_BRANCH="${{ github.base_ref }}"
# Clean the repository directory before starting
docker run --rm --net=host -v $(pwd):/workspace -w /workspace \
-e "REPO_NAME=${REPO_NAME}" \
-e "BASE_BRANCH=${BASE_BRANCH}" \
${docker_image} /bin/bash -c '
if [ -d ${REPO_NAME} ]; then
echo "Directory ${REPO_NAME} exists, removing it..."
rm -rf ${REPO_NAME}
fi
'
git config --global user.name "FastDeployCI"
git config --global user.email "fastdeploy_ci@example.com"
git clone ${REPO} ${REPO_NAME} -b ${BASE_BRANCH}
cd FastDeploy
if [ "${{ github.event_name }}" = "pull_request" ]; then
git fetch origin pull/${{ github.event.pull_request.number }}/head:pr/${{ github.event.pull_request.number }}
git merge pr/${{ github.event.pull_request.number }}
git log -n 3 --oneline
else
git checkout ${{ github.sha }}
git log -n 3 --oneline
fi
- name: Run CI unittest
env:
docker_image: ccr-2vdh3abv-pub.cnc.bj.baidubce.com/device/paddle-gcu:topsrider3.5.102-ubuntu20-x86_64-gcc84
run: |
runner_name="${{ runner.name }}"
last_char="${runner_name: -1}"
if [[ "$last_char" =~ [0-3] ]]; then
gcu_id="$last_char"
else
gcu_id="0"
fi
FD_API_PORT=$((9180 + gcu_id * 100))
FD_ENGINE_QUEUE_PORT=$((9150 + gcu_id * 100))
FD_METRICS_PORT=$((9170 + gcu_id * 100))
PARENT_DIR=$(dirname "$WORKSPACE")
echo "PARENT_DIR:$PARENT_DIR"
echo "Install drivers..."
cd /work/deps
bash TopsRider_i3x_*_deb_amd64.run --driver --no-auto-load -y
cd -
docker run --rm --network=host --ipc=host -it --privileged \
-v $(pwd):/workspace -w /workspace \
-v "/home:/home" \
-v "/work:/work" \
-e "MODEL_PATH=/work/models" \
-e "http_proxy=$(git config --global --get http.proxy)" \
-e "https_proxy=$(git config --global --get https.proxy)" \
-e "FD_API_PORT=${FD_API_PORT}" \
-e "FD_ENGINE_QUEUE_PORT=${FD_ENGINE_QUEUE_PORT}" \
-e "FD_METRICS_PORT=${FD_METRICS_PORT}" \
${docker_image} /bin/bash -c "
git config --global --add safe.directory /workspace/FastDeploy
cd FastDeploy
bash scripts/run_ci_gcu.sh
"

View File

@@ -2,9 +2,7 @@ name: CI_XPU
on:
pull_request:
branches:
- develop
- 'release/*'
branches: [ develop ]
workflow_dispatch:
concurrency:
@@ -12,7 +10,7 @@ concurrency:
cancel-in-progress: true
jobs:
CI_XPU:
build:
runs-on: [self-hosted, XPU-P800-8Card]
steps:
- name: Print current runner name
@@ -29,11 +27,9 @@ jobs:
REPO="https://github.com/${{ github.repository }}.git"
FULL_REPO="${{ github.repository }}"
REPO_NAME="${FULL_REPO##*/}"
BASE_BRANCH="${{ github.base_ref }}"
# Clean the repository directory before starting
docker run --rm --net=host -v $(pwd):/workspace -w /workspace \
-e "REPO_NAME=${REPO_NAME}" \
-e "BASE_BRANCH=${BASE_BRANCH}" \
${docker_image} /bin/bash -c '
if [ -d ${REPO_NAME} ]; then
echo "Directory ${REPO_NAME} exists, removing it..."
@@ -42,7 +38,7 @@ jobs:
'
git config --global user.name "FastDeployCI"
git config --global user.email "fastdeploy_ci@example.com"
git clone ${REPO} ${REPO_NAME} -b ${BASE_BRANCH}
git clone ${REPO} ${REPO_NAME}
cd FastDeploy
if [ "${{ github.event_name }}" = "pull_request" ]; then
git fetch origin pull/${{ github.event.pull_request.number }}/head:pr/${{ github.event.pull_request.number }}
@@ -63,7 +59,7 @@ jobs:
if [[ "$last_char" =~ [0-3] ]]; then
gpu_id="$last_char"
else
gpu_id="0"
gpu_id="0"
fi
FD_API_PORT=$((9180 + gpu_id * 100))
FD_ENGINE_QUEUE_PORT=$((9150 + gpu_id * 100))
@@ -84,4 +80,4 @@ jobs:
git config --global --add safe.directory /workspace/FastDeploy
cd FastDeploy
bash scripts/run_ci_xpu.sh
"
"

View File

@@ -1,65 +0,0 @@
name: PR Build and Test
on:
pull_request:
types: [opened, synchronize]
branches: [develop, release/**]
permissions: read-all
concurrency:
group: ${{ github.event.pull_request.number }}-${{ github.workflow }}
cancel-in-progress: true
jobs:
clone:
name: FD-Clone-Linux
uses: ./.github/workflows/_clone_linux.yml
build:
name: FD-Build-Linux
needs: clone
uses: ./.github/workflows/_build_linux.yml
with:
DOCKER_IMAGE: ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/paddleqa:cuda126-py310
FASTDEPLOY_ARCHIVE_URL: ${{ needs.clone.outputs.repo_archive_url }}
COMPILE_ARCH: "89,90"
WITH_NIGHTLY_BUILD: "OFF"
FD_VERSION: "0.0.0"
resultshow:
name: Use Build Output
needs: build
runs-on: ubuntu-latest
steps:
- name: Print wheel path
run: |
echo "The built wheel is located at: ${{ needs.build.outputs.wheel_path }}"
unittest_coverage:
name: Run FastDeploy Unit Tests and Coverage
needs: [clone,build]
uses: ./.github/workflows/_unit_test_coverage.yml
with:
DOCKER_IMAGE: ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/paddleqa:fastdeploy-ciuse-cuda126-dailyupdate
FASTDEPLOY_ARCHIVE_URL: ${{ needs.clone.outputs.repo_archive_url }}
FASTDEPLOY_WHEEL_URL: ${{ needs.build.outputs.wheel_path }}
MODEL_CACHE_DIR: "/ssd2/actions-runner/ModelData"
logprob_test:
name: Run FastDeploy LogProb Tests
needs: [build]
uses: ./.github/workflows/_logprob_test_linux.yml
with:
DOCKER_IMAGE: ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/paddleqa:fastdeploy-ciuse-cuda126-dailyupdate
PADDLETEST_ARCHIVE_URL: "https://xly-devops.bj.bcebos.com/PaddleTest/PaddleTest.tar.gz"
FASTDEPLOY_WHEEL_URL: ${{ needs.build.outputs.wheel_path }}
MODEL_CACHE_DIR: "/ssd2/actions-runner/ModelData"
pre_ce_test:
name: Extracted partial CE model tasks to run in CI.
needs: [clone,build]
uses: ./.github/workflows/_pre_ce_test.yml
with:
DOCKER_IMAGE: ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/paddleqa:fastdeploy-ciuse-cuda126-dailyupdate
FASTDEPLOY_ARCHIVE_URL: ${{ needs.clone.outputs.repo_archive_url }}
FASTDEPLOY_WHEEL_URL: ${{ needs.build.outputs.wheel_path }}
MODEL_CACHE_DIR: "/ssd2/actions-runner/ModelData"

.gitignore vendored
View File

@@ -162,5 +162,3 @@ custom_ops/tmp*
build
.ccls-cache
third_party

View File

@@ -3,30 +3,14 @@ default_install_hook_types:
- commit-msg
default_stages:
- pre-commit # Run locally
- commit-msg
# - manual # Run in CI
repos:
- repo: https://github.com/psf/black.git
rev: 25.1.0
hooks:
- id: black
files: \.(py|pyi)$
additional_dependencies: [toml]
# Auto-sort imports
- repo: https://github.com/PyCQA/isort
rev: 5.11.5
hooks:
- id: isort
- repo: https://github.com/PyCQA/flake8
rev: 7.0.0
hooks:
- id: flake8
# Code checks
- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.11.7
hooks:
- id: ruff
args: [--output-format, github, --fix, --line-length=120, --config, pyproject.toml]
args: [--output-format, github, --fix, --line-length=120]
# # Spell check
# - repo: https://github.com/codespell-project/codespell
# rev: v2.4.1
@@ -34,13 +18,17 @@ repos:
# - id: codespell
# additional_dependencies: ['tomli']
# args: ['--toml', 'pyproject.toml']
# Auto-sort imports
- repo: https://github.com/PyCQA/isort
rev: 6.0.1
hooks:
- id: isort
# markdown
- repo: https://github.com/jackdewinter/pymarkdown
rev: v0.9.29
hooks:
- id: pymarkdown
args: ["-d", "MD029,MD031", fix]
args: [fix]
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v5.0.0
hooks:

View File

@@ -8,17 +8,14 @@
<a href="https://github.com/PaddlePaddle/FastDeploy/commits"><img src="https://img.shields.io/github/commit-activity/m/PaddlePaddle/FastDeploy?color=3af"></a>
<a href="https://github.com/PaddlePaddle/FastDeploy/issues"><img src="https://img.shields.io/github/issues/PaddlePaddle/FastDeploy?color=9cc"></a>
<a href="https://github.com/PaddlePaddle/FastDeploy/stargazers"><img src="https://img.shields.io/github/stars/PaddlePaddle/FastDeploy?color=ccf"></a>
</p>
<p align="center">
<a href="https://trendshift.io/repositories/4046" target="_blank"><img src="https://trendshift.io/api/badge/repositories/4046" alt="PaddlePaddle%2FFastDeploy | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/></a></br>
<a href="https://paddlepaddle.github.io/FastDeploy/get_started/installation/nvidia_gpu/"><b> Installation </b></a>
|
<a href="https://paddlepaddle.github.io/FastDeploy/get_started/quick_start"><b> Quick Start </b></a>
|
<a href="https://paddlepaddle.github.io/FastDeploy/supported_models/"><b> Supported Models </b></a>
</p>
--------------------------------------------------------------------------------
@@ -26,10 +23,6 @@
## News
**[2025-07] 《FastDeploy2.0推理部署实测》专题活动已上线!** 完成文心4.5系列开源模型的推理部署等任务即可获得骨瓷马克杯等FastDeploy2.0官方周边及丰富奖金!🎁 欢迎大家体验反馈~ 📌[报名地址](https://www.wjx.top/vm/meSsp3L.aspx#) 📌[活动详情](https://github.com/PaddlePaddle/FastDeploy/discussions/2728)
**[2025-07] The FastDeploy 2.0 Inference Deployment Challenge is now live!** Complete the inference deployment task for the ERNIE 4.5 series open-source models to win official FastDeploy 2.0 merch and generous prizes! 🎁 You're welcome to try it out and share your feedback! 📌[Sign up here](https://www.wjx.top/vm/meSsp3L.aspx#) 📌[Event details](https://github.com/PaddlePaddle/FastDeploy/discussions/2728)
**[2025-06] 🔥 Released FastDeploy v2.0:** Supports inference and deployment for ERNIE 4.5. Furthermore, we open-source an industrial-grade PD disaggregation with context caching, dynamic role switching for effective resource utilization to further enhance inference performance for MoE models.
## About

View File

@@ -41,10 +41,7 @@ python -m pip install -r requirements.txt
--metric-percentiles 80,95,99,99.9,99.95,99.99: percentiles reported for the performance metrics
--num-prompts 1: total number of requests to send
--max-concurrency 1: benchmark concurrency
--save-result: save results to a JSON file (default False, not saved)
--debug: enable debug mode and print each payload and output (default False)
--shuffle: whether to shuffle the dataset (default False, no shuffling)
--seed: random seed used when shuffling the dataset (default 0)
--save-result: save results to a JSON file
```
##### Benchmarking the /v1/chat/completions endpoint (single-request debugging)
@@ -108,30 +105,3 @@ python benchmark_serving.py \
--save-result > infer_log.txt 2>&1 &
```
### Speculative decoding benchmark tool
#### Usage:
```bash
python benchmarks/benchmark_mtp.py \
--host 127.0.0.1 --port 8000 \
--max-concurrency 16 32 64 96 --num-prompts 256 \
--acceptance-rate 0.8 --draft-token-steps 1 2 3 \
--s_itl-base-model 15.88 22.84 16.47 16.93 \
--dataset-name EBChat \
--dataset-path ./filtered_sharedgpt_2000_input_1136_output_200_fd.json
```
#### Parameter description
```bash
--host: service IP address, used to build the URL
--port: service HTTP port, used to build the URL
--max-concurrency: test concurrency levels
--num-prompts: total number of requests to send
--acceptance-rate: simulated acceptance rate for speculative decoding
--draft-token-steps: number of draft token steps for speculative decoding
--s_itl-base-model: decode latency of the base model; can be obtained with the benchmark tool above, one value per batch size
--dataset-name: dataset class; set to "EBChat" to read exported FD-format datasets
--dataset-path: path to the test dataset
```

View File

@@ -29,14 +29,13 @@ from typing import Optional
import aiohttp
from tqdm.asyncio import tqdm
AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=6 * 60 * 60)
@dataclass
class RequestFuncInput:
"""Input for requesting LLMs via API"""
no: int
prompt: str
history_QA: Optional[dict]
hyper_parameters: dict
@@ -50,14 +49,11 @@ class RequestFuncInput:
multi_modal_content: Optional[dict] = None
ignore_eos: bool = False
language: Optional[str] = None
debug: bool = False
@dataclass
class RequestFuncOutput:
"""Output for requesting LLMs via API"""
no: int = 0
generated_text: str = ""
reasoning_content: str = ""
success: bool = False
@@ -68,7 +64,7 @@ class RequestFuncOutput:
itl: list = field(default_factory=list) # list of inter-token latencies
tpot: float = 0.0 # avg next-token latencies
prompt_len: int = 0
prompt_tokens: int = 0 # number of input tokens returned by the inference side
prompt_tokens: int = 0 # number of input tokens returned by the inference side
error: str = ""
@@ -78,19 +74,22 @@ async def async_request_eb_openai_chat_completions(
) -> RequestFuncOutput:
"""Request an LLM using EB OpenAI"""
api_url = request_func_input.api_url
assert api_url.endswith(("completions", "profile")), "OpenAI Chat Completions API URL must end with 'completions'."
assert api_url.endswith(
("completions", "profile")
), "OpenAI Chat Completions API URL must end with 'completions'."
async with aiohttp.ClientSession(trust_env=True, timeout=AIOHTTP_TIMEOUT) as session:
async with aiohttp.ClientSession(trust_env=True,
timeout=AIOHTTP_TIMEOUT) as session:
content = [{"type": "text", "text": request_func_input.prompt}]
if request_func_input.multi_modal_content:
content.append(request_func_input.multi_modal_content)
payload = {
"model": request_func_input.model,
"model": "default",
"messages": request_func_input.history_QA,
"stream": True,
"stream_options": {
"include_usage": True,
"continuous_usage_stats": True,
"continuous_usage_stats": True
},
}
# Hyperparameters are passed in via yaml
@@ -98,10 +97,6 @@ async def async_request_eb_openai_chat_completions(
if request_func_input.ignore_eos:
payload["ignore_eos"] = request_func_input.ignore_eos
if request_func_input.debug:
print(f"payload:{json.dumps(payload, ensure_ascii=False)}")
headers = {
"Content-Type": "application/json",
"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}",
@@ -109,20 +104,21 @@ async def async_request_eb_openai_chat_completions(
output = RequestFuncOutput()
output.prompt_len = 0
output.no = request_func_input.no
ttft = 0.0
st = time.perf_counter()
most_recent_timestamp = st
try:
async with session.post(url=api_url, json=payload, headers=headers) as response:
async with session.post(url=api_url, json=payload,
headers=headers) as response:
if response.status == 200:
async for chunk_bytes in response.content:
chunk_bytes = chunk_bytes.strip()
if not chunk_bytes:
continue
chunk = chunk_bytes.decode("utf-8").removeprefix("data: ")
chunk = chunk_bytes.decode("utf-8").removeprefix(
"data: ")
if chunk != "[DONE]":
# print("####chunk:", chunk, type(chunk))
timestamp = time.perf_counter()
@@ -136,20 +132,21 @@ async def async_request_eb_openai_chat_completions(
ttft = timestamp - st
output.ttft = ttft
# cached_tokens
output.prompt_len = (
data["usage"].get("prompt_tokens_details", {}).get("cached_tokens", 0)
)
output.prompt_len = data["usage"]["prompt_tokens_details"]["cached_tokens"]
# Decoding phase
else:
output.itl.append(timestamp - most_recent_timestamp)
output.itl.append(timestamp -
most_recent_timestamp)
output.generated_text += content or ""
output.reasoning_content += reason_content or ""
output.arrival_time.append(choices[0].get("arrival_time", timestamp))
elif usage := data.get("usage", {}):
output.output_tokens = usage.get("completion_tokens", 0)
output.prompt_tokens = usage.get("prompt_tokens", 0)
output.arrival_time.append(choices[0].get("arrival_time"))
elif usage := data.get("usage"):
output.output_tokens = usage.get(
"completion_tokens")
output.prompt_tokens = usage.get(
"prompt_tokens")
most_recent_timestamp = timestamp
@@ -162,12 +159,7 @@ async def async_request_eb_openai_chat_completions(
output.latency = most_recent_timestamp - st
else:
error_text = await response.text()
print(
"####error response:",
error_text,
"####payload:",
payload,
)
print("####error response:", error_text, "####payload:", payload)
output.error = error_text or ""
output.success = False
except Exception:
@@ -181,8 +173,6 @@ async def async_request_eb_openai_chat_completions(
f.write(str(output) + "\n")
if pbar:
pbar.update(1)
if request_func_input.debug:
print("#####final_output:", output)
return output
@@ -196,14 +186,15 @@ async def async_request_eb_openai_completions(
("completions", "profile")
), "OpenAI Completions API URL must end with 'completions' or 'profile'."
async with aiohttp.ClientSession(trust_env=True, timeout=AIOHTTP_TIMEOUT) as session:
async with aiohttp.ClientSession(trust_env=True,
timeout=AIOHTTP_TIMEOUT) as session:
payload = {
"model": request_func_input.model,
"model": "default",
"prompt": request_func_input.prompt,
"stream": True,
"stream_options": {
"include_usage": True,
"continuous_usage_stats": True,
"continuous_usage_stats": True
},
}
# Hyperparameters are passed in via yaml
@@ -211,25 +202,19 @@ async def async_request_eb_openai_completions(
if request_func_input.ignore_eos:
payload["ignore_eos"] = request_func_input.ignore_eos
if request_func_input.debug:
print("payload:", json.dumps(payload, ensure_ascii=False))
headers = {
"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}",
"Content-Type": "application/json",
"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"
}
output = RequestFuncOutput()
output.prompt_len = request_func_input.prompt_len
output.no = request_func_input.no
generated_text = ""
ttft = 0.0
st = time.perf_counter()
most_recent_timestamp = st
try:
async with session.post(url=api_url, json=payload, headers=headers) as response:
async with session.post(url=api_url, json=payload,
headers=headers) as response:
if response.status == 200:
first_chunk_received = False
async for chunk_bytes in response.content:
@@ -237,10 +222,10 @@ async def async_request_eb_openai_completions(
if not chunk_bytes:
continue
chunk = chunk_bytes.decode("utf-8").removeprefix("data: ")
chunk = chunk_bytes.decode("utf-8").removeprefix(
"data: ")
if chunk != "[DONE]":
# print("####chunk:", chunk, chunk.usage)
timestamp = time.perf_counter()
data = json.loads(chunk)
# NOTE: Some completion API might have a last
@@ -250,40 +235,35 @@ async def async_request_eb_openai_completions(
# Note that text could be empty here
# e.g. for special tokens
text = choices[0].get("text")
timestamp = time.perf_counter()
# First token
if not first_chunk_received:
first_chunk_received = True
ttft = timestamp - st
ttft = time.perf_counter() - st
output.ttft = ttft
# Decoding phase
else:
output.itl.append(timestamp - most_recent_timestamp)
generated_text += text or ""
output.itl.append(timestamp -
most_recent_timestamp)
most_recent_timestamp = timestamp
output.arrival_time.append(choices[0].get("arrival_time", timestamp))
output.arrival_time.append(choices[0].get("arrival_time"))
generated_text += text or ""
elif usage := data.get("usage"):
output.prompt_tokens = usage.get("prompt_tokens")
output.output_tokens = usage.get("completion_tokens")
output.prompt_tokens = usage.get(
"prompt_tokens")
output.output_tokens = usage.get(
"completion_tokens")
if first_chunk_received:
output.success = True
else:
output.success = False
output.error = (
"Never received a valid chunk to calculate TTFT." "This response will be marked as failed!"
)
"Never received a valid chunk to calculate TTFT."
"This response will be marked as failed!")
output.generated_text = generated_text
output.latency = most_recent_timestamp - st
if output.generated_text == "":
output.success = False
output.error = "No generated text found!"
else:
output.success = True
else:
output.error = response.reason or ""
output.success = False
@@ -292,9 +272,6 @@ async def async_request_eb_openai_completions(
exc_info = sys.exc_info()
output.error = "".join(traceback.format_exception(*exc_info))
if request_func_input.debug:
print(f"final_output:{output}")
if pbar:
pbar.update(1)
return output
@@ -308,7 +285,8 @@ async def async_request_tgi(
api_url = request_func_input.api_url
assert api_url.endswith("generate_stream")
async with aiohttp.ClientSession(trust_env=True, timeout=AIOHTTP_TIMEOUT) as session:
async with aiohttp.ClientSession(trust_env=True,
timeout=AIOHTTP_TIMEOUT) as session:
params = {
"max_new_tokens": request_func_input.output_len,
"do_sample": True,
@@ -355,7 +333,8 @@ async def async_request_tgi(
# Decoding phase
else:
output.itl.append(timestamp - most_recent_timestamp)
output.itl.append(timestamp -
most_recent_timestamp)
most_recent_timestamp = timestamp
output.arrival_time.append(data["arrival_time"])
@@ -384,7 +363,8 @@ async def async_request_trt_llm(
api_url = request_func_input.api_url
assert api_url.endswith("generate_stream")
async with aiohttp.ClientSession(trust_env=True, timeout=AIOHTTP_TIMEOUT) as session:
async with aiohttp.ClientSession(trust_env=True,
timeout=AIOHTTP_TIMEOUT) as session:
payload = {
"accumulate_tokens": True,
"text_input": request_func_input.prompt,
@@ -409,7 +389,8 @@ async def async_request_trt_llm(
if not chunk_bytes:
continue
chunk = chunk_bytes.decode("utf-8").removeprefix("data:")
chunk = chunk_bytes.decode("utf-8").removeprefix(
"data:")
data = json.loads(chunk)
output.generated_text += data["text_output"]
@@ -421,7 +402,8 @@ async def async_request_trt_llm(
# Decoding phase
else:
output.itl.append(timestamp - most_recent_timestamp)
output.itl.append(timestamp -
most_recent_timestamp)
most_recent_timestamp = timestamp
@@ -446,7 +428,8 @@ async def async_request_deepspeed_mii(
pbar: Optional[tqdm] = None,
) -> RequestFuncOutput:
"""Request an LLM using Deepspeed MII"""
async with aiohttp.ClientSession(trust_env=True, timeout=AIOHTTP_TIMEOUT) as session:
async with aiohttp.ClientSession(trust_env=True,
timeout=AIOHTTP_TIMEOUT) as session:
payload = {
"prompt": request_func_input.prompt,
@@ -464,16 +447,19 @@ async def async_request_deepspeed_mii(
st = time.perf_counter()
try:
async with session.post(url=request_func_input.api_url, json=payload) as response:
async with session.post(url=request_func_input.api_url,
json=payload) as response:
if response.status == 200:
parsed_resp = await response.json()
output.latency = time.perf_counter() - st
if "choices" in parsed_resp:
output.generated_text = parsed_resp["choices"][0]["text"]
output.generated_text = parsed_resp["choices"][0][
"text"]
elif "text" in parsed_resp:
output.generated_text = parsed_resp["text"][0]
else:
output.error = "Unexpected response format: " "neither 'choices' nor 'text' found"
output.error = ("Unexpected response format: "
"neither 'choices' nor 'text' found")
output.success = False
output.success = True
else:
@@ -499,22 +485,26 @@ async def async_request_openai_completions(
("completions", "profile")
), "OpenAI Completions API URL must end with 'completions' or 'profile'."
async with aiohttp.ClientSession(trust_env=True, timeout=AIOHTTP_TIMEOUT) as session:
async with aiohttp.ClientSession(trust_env=True,
timeout=AIOHTTP_TIMEOUT) as session:
payload = {
"model": (request_func_input.model_name if request_func_input.model_name else request_func_input.model),
"model": request_func_input.model_name \
if request_func_input.model_name else request_func_input.model,
"prompt": request_func_input.prompt,
# "temperature": 0.0,
"max_tokens": request_func_input.output_len,
"logprobs": request_func_input.logprobs,
"stream": True,
# "stream_options": {
#"stream_options": {
# "include_usage": True,
# },
#},
}
if request_func_input.ignore_eos:
payload["ignore_eos"] = request_func_input.ignore_eos
headers = {"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"}
headers = {
"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"
}
output = RequestFuncOutput()
output.prompt_len = request_func_input.prompt_len
@@ -523,7 +513,8 @@ async def async_request_openai_completions(
st = time.perf_counter()
most_recent_timestamp = st
try:
async with session.post(url=api_url, json=payload, headers=headers) as response:
async with session.post(url=api_url, json=payload,
headers=headers) as response:
if response.status == 200:
first_chunk_received = False
async for chunk_bytes in response.content:
@@ -531,7 +522,8 @@ async def async_request_openai_completions(
if not chunk_bytes:
continue
chunk = chunk_bytes.decode("utf-8").removeprefix("data: ")
chunk = chunk_bytes.decode("utf-8").removeprefix(
"data: ")
if chunk != "[DONE]":
# print("####chunk:", chunk, type(chunk))
data = json.loads(chunk)
@@ -552,19 +544,21 @@ async def async_request_openai_completions(
# Decoding phase
else:
output.itl.append(timestamp - most_recent_timestamp)
output.itl.append(timestamp -
most_recent_timestamp)
most_recent_timestamp = timestamp
generated_text += text or ""
elif usage := data.get("usage"):
output.output_tokens = usage.get("completion_tokens")
output.output_tokens = usage.get(
"completion_tokens")
if first_chunk_received:
output.success = True
else:
output.success = False
output.error = (
"Never received a valid chunk to calculate TTFT." "This response will be marked as failed!"
)
"Never received a valid chunk to calculate TTFT."
"This response will be marked as failed!")
output.generated_text = generated_text
output.latency = most_recent_timestamp - st
else:
@@ -587,24 +581,25 @@ async def async_request_openai_audio(
"""Request an LLM using OpenAI"""
# Lazy import without PlaceholderModule to avoid vllm dep.
import soundfile
api_url = request_func_input.api_url
assert api_url.endswith(
("transcriptions", "translations")
), "OpenAI Chat Completions API URL must end with 'transcriptions' "
("transcriptions", "translations"
)), "OpenAI Chat Completions API URL must end with 'transcriptions' "
"or `translations`."
async with aiohttp.ClientSession(trust_env=True, timeout=AIOHTTP_TIMEOUT) as session:
async with aiohttp.ClientSession(trust_env=True,
timeout=AIOHTTP_TIMEOUT) as session:
content = [{"type": "text", "text": request_func_input.prompt}]
payload = {
"model": (request_func_input.model_name if request_func_input.model_name else request_func_input.model),
"model": request_func_input.model_name \
if request_func_input.model_name else request_func_input.model,
"temperature": 0.0,
"max_completion_tokens": request_func_input.output_len,
"stream": True,
"language": "en",
# Flattened due to multipart/form-data
"stream_include_usage": True,
"stream_continuous_usage_stats": True,
"stream_continuous_usage_stats": True
}
if request_func_input.extra_body:
payload.update(request_func_input.extra_body)
@@ -619,9 +614,9 @@ async def async_request_openai_audio(
buffer.seek(0)
return buffer
with to_bytes(*request_func_input.multi_modal_content["audio"]) as f:
with to_bytes(*request_func_input.multi_modal_content['audio']) as f:
form = aiohttp.FormData()
form.add_field("file", f, content_type="audio/wav")
form.add_field('file', f, content_type='audio/wav')
for key, value in payload.items():
form.add_field(key, str(value))
@@ -633,20 +628,24 @@ async def async_request_openai_audio(
st = time.perf_counter()
most_recent_timestamp = st
try:
async with session.post(url=api_url, data=form, headers=headers) as response:
async with session.post(url=api_url,
data=form,
headers=headers) as response:
if response.status == 200:
async for chunk_bytes in response.content:
chunk_bytes = chunk_bytes.strip()
if not chunk_bytes:
continue
chunk = chunk_bytes.decode("utf-8").removeprefix("data: ")
chunk = chunk_bytes.decode("utf-8").removeprefix(
"data: ")
if chunk != "[DONE]":
timestamp = time.perf_counter()
data = json.loads(chunk)
if choices := data.get("choices"):
content = choices[0]["delta"].get("content")
content = choices[0]["delta"].get(
"content")
# First token
if ttft == 0.0:
ttft = timestamp - st
@@ -654,11 +653,13 @@ async def async_request_openai_audio(
# Decoding phase
else:
output.itl.append(timestamp - most_recent_timestamp)
output.itl.append(
timestamp - most_recent_timestamp)
generated_text += content or ""
elif usage := data.get("usage"):
output.output_tokens = usage.get("completion_tokens")
output.output_tokens = usage.get(
"completion_tokens")
most_recent_timestamp = timestamp
@@ -692,11 +693,8 @@ ASYNC_REQUEST_FUNCS = {
}
OPENAI_COMPATIBLE_BACKENDS = [
k
for k, v in ASYNC_REQUEST_FUNCS.items()
if v
in (
async_request_openai_completions,
async_request_eb_openai_chat_completions,
)
k for k, v in ASYNC_REQUEST_FUNCS.items()
if v in (async_request_openai_completions,
async_request_eb_openai_chat_completions)
]

View File

@@ -26,10 +26,10 @@ from abc import ABC, abstractmethod
from collections.abc import Mapping
from dataclasses import dataclass
from io import BytesIO
from typing import Any, Optional, Union
from typing import Any, Callable, Optional, Union
from PIL import Image
logger = logging.getLogger(__name__)
@@ -39,7 +39,6 @@ class SampleRequest:
Represents a single inference request for benchmarking.
"""
no: int
prompt: Union[str, Any]
history_QA: Union[str, Any]
json_data: Optional[dict]
@@ -49,7 +48,6 @@ class SampleRequest:
class BenchmarkDataset(ABC):
"""BenchmarkDataset"""
DEFAULT_SEED = 0
IS_MULTIMODAL = False
@@ -57,7 +55,6 @@ class BenchmarkDataset(ABC):
self,
dataset_path: Optional[str] = None,
random_seed: int = DEFAULT_SEED,
shuffle: bool = False,
hyperparameter_path: Optional[str] = None,
) -> None:
"""
@@ -71,9 +68,9 @@ class BenchmarkDataset(ABC):
self.dataset_path = dataset_path
# Set the random seed, ensuring that a None value is replaced with the
# default seed.
self.random_seed = random_seed if random_seed is not None else self.DEFAULT_SEED
self.random_seed = (random_seed
if random_seed is not None else self.DEFAULT_SEED)
self.data = None
self.shuffle = shuffle
self.hyperparameter_path = hyperparameter_path
self.hyperparameters = {}
@@ -88,7 +85,8 @@ class BenchmarkDataset(ABC):
NotImplementedError: If a subclass does not implement this method.
"""
# TODO (jenniferzhao): add support for downloading data
raise NotImplementedError("load_data must be implemented in subclasses.")
raise NotImplementedError(
"load_data must be implemented in subclasses.")
@abstractmethod
def sample(self, num_requests: int) -> list[SampleRequest]:
@@ -107,7 +105,8 @@ class BenchmarkDataset(ABC):
"""
raise NotImplementedError("sample must be implemented in subclasses.")
def maybe_oversample_requests(self, requests: list[SampleRequest], num_requests: int) -> None:
def maybe_oversample_requests(self, requests: list[SampleRequest],
num_requests: int) -> None:
"""
Oversamples the list of requests if its size is less than the desired
number.
@@ -118,9 +117,11 @@ class BenchmarkDataset(ABC):
"""
if len(requests) < num_requests:
random.seed(self.random_seed)
additional = random.choices(requests, k=num_requests - len(requests))
additional = random.choices(requests,
k=num_requests - len(requests))
requests.extend(additional)
logger.info("Oversampled requests to reach %d total samples.", num_requests)
logger.info("Oversampled requests to reach %d total samples.",
num_requests)
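For reference, maybe_oversample_requests pads the request list by sampling existing requests with replacement until the target count is reached. A tiny standalone example (the string requests are stand-ins for SampleRequest objects):

    import random

    requests = ["r1", "r2", "r3"]      # stand-ins for SampleRequest objects
    num_requests = 5
    if len(requests) < num_requests:
        random.seed(0)
        # Sample with replacement, so duplicates are expected.
        additional = random.choices(requests, k=num_requests - len(requests))
        requests.extend(additional)
    print(len(requests))  # 5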
def is_valid_sequence(
@@ -140,12 +141,14 @@ def is_valid_sequence(
"""
# Check for invalid conditions
prompt_too_short = prompt_len < min_len
output_too_short = (not skip_min_output_len_check) and (output_len < min_len)
output_too_short = (not skip_min_output_len_check) and (output_len
< min_len)
prompt_too_long = prompt_len > max_prompt_len
combined_too_long = (prompt_len + output_len) > max_total_len
# Return True if none of the invalid conditions are met
return not (prompt_too_short or output_too_short or prompt_too_long or combined_too_long)
return not (prompt_too_short or output_too_short or prompt_too_long
or combined_too_long)
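is_valid_sequence filters out prompt/output length pairs that fall outside the configured budget: prompts below min_len, outputs below min_len (unless that check is skipped), prompts above max_prompt_len, and combined lengths above max_total_len. A self-contained sketch of the same check; the threshold defaults below are illustrative, not taken from this diff:

    def is_valid_sequence(prompt_len, output_len, min_len=4,
                          max_prompt_len=1024, max_total_len=2048,
                          skip_min_output_len_check=False):
        # Reject anything outside the configured length budget.
        prompt_too_short = prompt_len < min_len
        output_too_short = (not skip_min_output_len_check) and (output_len < min_len)
        prompt_too_long = prompt_len > max_prompt_len
        combined_too_long = (prompt_len + output_len) > max_total_len
        return not (prompt_too_short or output_too_short
                    or prompt_too_long or combined_too_long)

    print(is_valid_sequence(16, 128))    # True: inside every bound
    print(is_valid_sequence(2, 128))     # False: prompt shorter than min_len
    print(is_valid_sequence(1500, 800))  # False: prompt and total both too long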
def process_image(image: Any) -> Mapping[str, Any]:
@@ -168,25 +171,28 @@ def process_image(image: Any) -> Mapping[str, Any]:
Raises:
ValueError: If the input is not a supported type.
"""
if isinstance(image, dict) and "bytes" in image:
image = Image.open(BytesIO(image["bytes"]))
if isinstance(image, dict) and 'bytes' in image:
image = Image.open(BytesIO(image['bytes']))
if isinstance(image, Image.Image):
image = image.convert("RGB")
with io.BytesIO() as image_data:
image.save(image_data, format="JPEG")
image_base64 = base64.b64encode(image_data.getvalue()).decode("utf-8")
image_base64 = base64.b64encode(
image_data.getvalue()).decode("utf-8")
return {
"type": "image_url",
"image_url": {"url": f"data:image/jpeg;base64,{image_base64}"},
"image_url": {
"url": f"data:image/jpeg;base64,{image_base64}"
},
}
if isinstance(image, str):
image_url = image if image.startswith(("http://", "file://")) else f"file://{image}"
image_url = (image if image.startswith(
("http://", "file://")) else f"file://{image}")
return {"type": "image_url", "image_url": {"url": image_url}}
raise ValueError(
f"Invalid image input {image}. Must be a PIL.Image.Image" " or str or dictionary with raw image bytes."
)
raise ValueError(f"Invalid image input {image}. Must be a PIL.Image.Image"
" or str or dictionary with raw image bytes.")
class EBDataset(BenchmarkDataset):
@@ -213,10 +219,6 @@ class EBDataset(BenchmarkDataset):
with open(self.dataset_path, encoding="utf-8") as f:
self.data = [json.loads(i.strip()) for i in f.readlines()]
if self.shuffle:
random.seed(self.random_seed)
random.shuffle(self.data)
def sample(
self,
num_requests: int,
@@ -227,7 +229,6 @@ class EBDataset(BenchmarkDataset):
**kwargs,
) -> list:
samples: list = []
cnt = 1
for entry in self.data:
if len(samples) >= num_requests:
break
@@ -241,17 +242,15 @@ class EBDataset(BenchmarkDataset):
new_output_len = int(entry["max_dec_len"])
if enable_multimodal_chat:
prompt = self.apply_multimodal_chat_transformation(prompt, None)
prompt = self.apply_multimodal_chat_transformation(
prompt, None)
samples.append(
SampleRequest(
no=cnt,
prompt=prompt,
prompt_len=self.prompt_len,
history_QA=[],
expected_output_len=new_output_len,
)
)
cnt += 1
))
self.maybe_oversample_requests(samples, num_requests)
return samples
@@ -262,7 +261,6 @@ class EBChatDataset(BenchmarkDataset):
Implements the ShareGPT dataset. Loads data from a JSON file and generates
sample requests based on conversation turns.
"""
prompt_len: int
def __init__(self, **kwargs) -> None:
@@ -276,10 +274,6 @@ class EBChatDataset(BenchmarkDataset):
with open(self.dataset_path, encoding="utf-8") as f:
self.data = [json.loads(i.strip()) for i in f.readlines()]
if self.shuffle:
random.seed(self.random_seed)
random.shuffle(self.data)
def sample(
self,
num_requests: int,
@@ -290,7 +284,6 @@ class EBChatDataset(BenchmarkDataset):
**kwargs,
) -> list:
samples: list = []
cnt = 1
for entry in self.data:
if len(samples) >= num_requests:
break
@@ -300,18 +293,17 @@ class EBChatDataset(BenchmarkDataset):
new_output_len = int(entry.get("max_tokens", 12288))
if enable_multimodal_chat:
prompt = self.apply_multimodal_chat_transformation(prompt, None)
prompt = self.apply_multimodal_chat_transformation(
prompt, None)
samples.append(
SampleRequest(
no=cnt,
json_data=json_data,
prompt=prompt,
prompt_len=0,
history_QA=history_QA,
expected_output_len=new_output_len,
)
)
cnt += 1
))
self.maybe_oversample_requests(samples, num_requests)
return samples

View File

@@ -1,178 +0,0 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
import argparse
import asyncio
import contextlib
import os
from typing import Union
from benchmark_dataset import EBChatDataset, EBDataset
from benchmark_serving import benchmark
def prepare_input_requests(num_prompts: int, dataset_name: str, dataset_path: str) -> Union[EBDataset, EBChatDataset]:
dataset_mapping = {
"EB": lambda: EBDataset(dataset_path=dataset_path).sample(num_requests=num_prompts),
"EBChat": lambda: EBChatDataset(dataset_path=dataset_path).sample(num_requests=num_prompts),
}
try:
input_requests = dataset_mapping[dataset_name]()
except KeyError as err:
raise ValueError(f"Unknown dataset: {dataset_name}") from err
return input_requests
class FakeTokenizer:
def encode(self, text: str, add_special_tokens: bool = False):
return []
def send_one_batch(base_url, max_concurrency, input_requests, disable_tqdm):
selected_percentile_metrics = ["s_itl"]
selected_percentiles = []
# Run benchmark
results = asyncio.run(
benchmark(
backend="openai-chat",
api_url=f"{base_url}/v1/chat/completions",
base_url=base_url,
model_id="default",
model_name="default",
input_requests=input_requests,
hyper_parameters={},
logprobs=None,
request_rate=float("inf"),
burstiness=1.0,
disable_tqdm=disable_tqdm,
profile=False,
selected_percentile_metrics=selected_percentile_metrics,
selected_percentiles=selected_percentiles,
ignore_eos=False,
goodput_config_dict=None,
max_concurrency=max_concurrency,
lora_modules=None,
extra_body=None,
)
)
record = {
"mean_s_itl_ms": results["mean_s_itl_ms"],
}
return record
def calculate_speedup(acceptance_rate, draft_token_step, t_ori, t_mtp):
tmp = 0.0
for i in range(draft_token_step):
tmp += pow(acceptance_rate, i + 1)
r_ac = tmp / (1 + tmp)
return t_ori / ((1 - r_ac) * t_mtp)
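calculate_speedup models speculative decoding: with acceptance rate a and k draft tokens per step, r_ac = (a + ... + a^k) / (1 + a + ... + a^k) is, reading the code, the fraction of work covered by accepted drafts, and the speedup compares the baseline inter-token latency t_ori against the MTP latency t_mtp scaled by (1 - r_ac). A worked example with made-up latencies:

    def calculate_speedup(acceptance_rate, draft_token_step, t_ori, t_mtp):
        tmp = 0.0
        for i in range(draft_token_step):
            tmp += pow(acceptance_rate, i + 1)
        r_ac = tmp / (1 + tmp)
        return t_ori / ((1 - r_ac) * t_mtp)

    # a=0.8, one draft step: tmp = 0.8, r_ac = 0.8 / 1.8 = 0.444...
    print(round(calculate_speedup(0.8, 1, 20.0, 22.0), 2))  # 1.64
    # a=0.8, two draft steps: tmp = 1.44, r_ac = 0.590...
    print(round(calculate_speedup(0.8, 2, 20.0, 22.0), 2))  # 2.22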
def main(args):
base_url = f"http://{args.host}:{args.port}"
input_requests = prepare_input_requests(args.num_prompts, args.dataset_name, args.dataset_path)
if len(args.max_concurrency) != len(args.s_itl_base_model):
raise ValueError("--max_concurrency should be same length as --s_itl_base_model")
for max_concurrency, s_itl in zip(args.max_concurrency, args.s_itl_base_model):
# Warmup
print("Starting warmup...")
with open(os.devnull, "w") as f:
with contextlib.redirect_stdout(f):
send_one_batch(
base_url,
max_concurrency,
input_requests[0:max_concurrency],
True,
)
# Benchmark
record = send_one_batch(base_url, max_concurrency, input_requests, False)
metric_header = "Speed up"
print("{s:{c}^{n}}".format(s=metric_header, n=50, c="-"))
for draft_token_step in args.draft_token_steps:
speedup = calculate_speedup(
args.acceptance_rate,
draft_token_step,
s_itl,
record["mean_s_itl_ms"],
)
print("{:<40} {:<10.2f}".format(f"Speed up on {draft_token_step} steps draft", speedup))
print("=" * 50)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--host",
type=str,
default="127.0.0.1",
)
parser.add_argument(
"--port",
type=str,
default="8000",
)
parser.add_argument(
"--max-concurrency",
type=int,
nargs="+",
default=(1, 2, 4, 8, 16, 32),
)
parser.add_argument(
"--num-prompts",
type=int,
default=128,
)
parser.add_argument(
"--acceptance-rate",
type=float,
default=0.8,
)
parser.add_argument(
"--draft-token-steps",
type=int,
nargs="+",
default=(1, 2),
)
parser.add_argument(
"--s_itl-base-model",
type=float,
nargs="+",
)
parser.add_argument(
"--dataset-name",
type=str,
default="EBChat",
)
parser.add_argument(
"--dataset-path",
type=str,
)
args = parser.parse_args()
main(args)

File diff suppressed because it is too large.

View File

@@ -24,11 +24,9 @@ import os
from typing import Any
def convert_to_pytorch_benchmark_format(
args: argparse.Namespace,
metrics: dict[str, list],
extra_info: dict[str, Any],
) -> list:
def convert_to_pytorch_benchmark_format(args: argparse.Namespace,
metrics: dict[str, list],
extra_info: dict[str, Any]) -> list:
"""
Save the benchmark results in the format used by PyTorch OSS benchmark with
one metric per record
@@ -56,10 +54,12 @@ def convert_to_pytorch_benchmark_format(
},
}
tp = record["benchmark"]["extra_info"]["args"].get("tensor_parallel_size")
tp = record["benchmark"]["extra_info"]["args"].get(
"tensor_parallel_size")
# Save tensor_parallel_size parameter if it's part of the metadata
if not tp and "tensor_parallel_size" in extra_info:
record["benchmark"]["extra_info"]["args"]["tensor_parallel_size"] = extra_info["tensor_parallel_size"]
record["benchmark"]["extra_info"]["args"][
"tensor_parallel_size"] = extra_info["tensor_parallel_size"]
records.append(record)
@@ -68,7 +68,6 @@ def convert_to_pytorch_benchmark_format(
class InfEncoder(json.JSONEncoder):
"""InfEncoder"""
def clear_inf(self, o: Any):
"""clear_inf"""
if isinstance(o, dict):
@@ -88,3 +87,4 @@ def write_to_json(filename: str, records: list) -> None:
"""write_to_json"""
with open(filename, "w") as f:
json.dump(records, f, cls=InfEncoder)

View File

@@ -25,32 +25,32 @@ import os
import random
import time
import warnings
from argparse import ArgumentParser as FlexibleArgumentParser
import yaml
import requests
import copy
from collections.abc import AsyncGenerator, Iterable
from dataclasses import dataclass
from datetime import datetime
from typing import Any, Optional
import numpy as np
import requests
import yaml
from backend_request_func import (
ASYNC_REQUEST_FUNCS,
OPENAI_COMPATIBLE_BACKENDS,
RequestFuncInput,
RequestFuncOutput,
)
from benchmark_dataset import EBChatDataset, EBDataset, SampleRequest
from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json
from backend_request_func import (ASYNC_REQUEST_FUNCS,
OPENAI_COMPATIBLE_BACKENDS, RequestFuncInput,
RequestFuncOutput)
from tqdm.asyncio import tqdm
from argparse import ArgumentParser as FlexibleArgumentParser
from benchmark_dataset import (SampleRequest, EBDataset, EBChatDataset)
from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json
MILLISECONDS_TO_SECONDS_CONVERSION = 1000
@dataclass
class BenchmarkMetrics:
"""Class containing all metrics that are used in this script"""
completed: int
total_input: int
total_output: int
@@ -133,7 +133,8 @@ async def get_request(
input_requests: Iterable[SampleRequest] = iter(input_requests)
# Calculate scale parameter theta to maintain the desired request_rate.
assert burstiness > 0, f"A positive burstiness factor is expected, but given {burstiness}."
assert burstiness > 0, (
f"A positive burstiness factor is expected, but given {burstiness}.")
theta = 1.0 / (request_rate * burstiness)
for request in input_requests:
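The scale parameter theta keeps the mean inter-arrival time at 1 / request_rate for any burstiness. The sampling line itself falls outside this hunk; the usual approach, and the one vLLM's serving benchmark uses, is to draw intervals from Gamma(shape=burstiness, scale=theta), which a quick check confirms preserves the target rate:

    import numpy as np

    request_rate = 4.0   # target requests per second (illustrative)
    burstiness = 1.0     # 1.0 gives Poisson arrivals; lower is burstier
    theta = 1.0 / (request_rate * burstiness)

    # Gamma(shape=burstiness, scale=theta) has mean burstiness * theta,
    # i.e. exactly 1 / request_rate, whatever the burstiness value.
    intervals = np.random.gamma(shape=burstiness, scale=theta, size=100000)
    print(round(intervals.mean(), 3))  # approximately 0.25 seconds between requests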
@@ -159,7 +160,7 @@ def calculate_metrics(
) -> tuple[BenchmarkMetrics, list[int]]:
"""Calculates various performance metrics based on the inputs and outputs."""
input_lens: list[int] = []
infer_input_lens: list[int] = [] # number of input tokens on the inference side
infer_input_lens: list[int] = []  # number of input tokens on the inference side
actual_output_lens: list[int] = []
total_input = 0
completed = 0
@@ -209,9 +210,8 @@ def calculate_metrics(
s_e2els.append(outputs[i].arrival_time[-1])
# decode speed, excluding the first token
if len(outputs[i].arrival_time) > 2:
s_decodes.append(
(outputs[i].output_tokens - 1) / (outputs[i].arrival_time[-1] - outputs[i].arrival_time[1])
)
s_decodes.append((outputs[i].output_tokens - 1) /
(outputs[i].arrival_time[-1] - outputs[i].arrival_time[1]))
completed += 1
else:
actual_output_lens.append(0)
@@ -224,13 +224,16 @@ def calculate_metrics(
if "ttft" in goodput_config_dict:
valid_metrics.append(ttfts)
slo_values.append(goodput_config_dict["ttft"] / MILLISECONDS_TO_SECONDS_CONVERSION)
slo_values.append(goodput_config_dict["ttft"] /
MILLISECONDS_TO_SECONDS_CONVERSION)
if "tpot" in goodput_config_dict:
valid_metrics.append(all_tpots)
slo_values.append(goodput_config_dict["tpot"] / MILLISECONDS_TO_SECONDS_CONVERSION)
slo_values.append(goodput_config_dict["tpot"] /
MILLISECONDS_TO_SECONDS_CONVERSION)
if "e2el" in goodput_config_dict:
valid_metrics.append(e2els)
slo_values.append(goodput_config_dict["e2el"] / MILLISECONDS_TO_SECONDS_CONVERSION)
slo_values.append(goodput_config_dict["e2el"] /
MILLISECONDS_TO_SECONDS_CONVERSION)
for req_metric in zip(*valid_metrics):
is_good_req = all([s >= r for s, r in zip(slo_values, req_metric)])
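Each goodput SLO is converted from milliseconds to seconds, and a request counts toward goodput only when every selected metric stays within its SLO. A standalone example with invented SLOs and measurements:

    MILLISECONDS_TO_SECONDS_CONVERSION = 1000

    goodput_config_dict = {"ttft": 300, "e2el": 5000}   # SLOs in milliseconds
    ttfts = [0.12, 0.45, 0.20]                          # measured, in seconds
    e2els = [3.90, 4.20, 6.10]

    slo_values = [goodput_config_dict["ttft"] / MILLISECONDS_TO_SECONDS_CONVERSION,
                  goodput_config_dict["e2el"] / MILLISECONDS_TO_SECONDS_CONVERSION]
    good_completed = sum(
        all(s >= r for s, r in zip(slo_values, req_metric))
        for req_metric in zip(ttfts, e2els))
    print(good_completed)  # 1: only the first request meets both SLOs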
@@ -239,9 +242,9 @@ def calculate_metrics(
if completed == 0:
warnings.warn(
"All requests failed. This is likely due to a misconfiguration " "on the benchmark arguments.",
stacklevel=2,
)
"All requests failed. This is likely due to a misconfiguration "
"on the benchmark arguments.",
stacklevel=2)
metrics = BenchmarkMetrics(
completed=completed,
total_input=total_input,
@@ -250,50 +253,64 @@ def calculate_metrics(
request_goodput=good_completed / dur_s,
output_throughput=sum(actual_output_lens) / dur_s,
total_token_throughput=(total_input + sum(actual_output_lens)) / dur_s,
mean_s_decode=np.mean(s_decodes or 0) * 1, # ttfts is empty if streaming is not supported by backend
mean_s_decode=np.mean(s_decodes or 0) *
1, # ttfts is empty if streaming is not supported by backend
std_s_decode=np.std(s_decodes or 0) * 1,
median_s_decode=np.median(s_decodes or 0) * 1,
percentiles_s_decode=[(p, np.percentile(s_decodes or 0, p) * 1) for p in selected_percentiles],
mean_ttft_ms=np.mean(ttfts or 0) * 1000, # ttfts is empty if streaming is not supported by backend
percentiles_s_decode=[(p, np.percentile(s_decodes or 0, p) * 1)
for p in selected_percentiles],
mean_ttft_ms=np.mean(ttfts or 0) *
1000, # ttfts is empty if streaming is not supported by backend
std_ttft_ms=np.std(ttfts or 0) * 1000,
median_ttft_ms=np.median(ttfts or 0) * 1000,
percentiles_ttft_ms=[(p, np.percentile(ttfts or 0, p) * 1000) for p in selected_percentiles],
mean_s_ttft_ms=np.mean(s_ttfts or 0) * 1000, # ttfts is empty if streaming is not supported by backend
percentiles_ttft_ms=[(p, np.percentile(ttfts or 0, p) * 1000)
for p in selected_percentiles],
mean_s_ttft_ms=np.mean(s_ttfts or 0) *
1000, # ttfts is empty if streaming is not supported by backend
std_s_ttft_ms=np.std(s_ttfts or 0) * 1000,
median_s_ttft_ms=np.median(s_ttfts or 0) * 1000,
percentiles_s_ttft_ms=[(p, np.percentile(s_ttfts or 0, p) * 1000) for p in selected_percentiles],
percentiles_s_ttft_ms=[(p, np.percentile(s_ttfts or 0, p) * 1000)
for p in selected_percentiles],
mean_tpot_ms=np.mean(tpots or 0) * 1000,
std_tpot_ms=np.std(tpots or 0) * 1000,
median_tpot_ms=np.median(tpots or 0) * 1000,
percentiles_tpot_ms=[(p, np.percentile(tpots or 0, p) * 1000) for p in selected_percentiles],
percentiles_tpot_ms=[(p, np.percentile(tpots or 0, p) * 1000)
for p in selected_percentiles],
mean_itl_ms=np.mean(itls or 0) * 1000,
std_itl_ms=np.std(itls or 0) * 1000,
median_itl_ms=np.median(itls or 0) * 1000,
percentiles_itl_ms=[(p, np.percentile(itls or 0, p) * 1000) for p in selected_percentiles],
percentiles_itl_ms=[(p, np.percentile(itls or 0, p) * 1000)
for p in selected_percentiles],
mean_s_itl_ms=np.mean(s_itls or 0) * 1000,
std_s_itl_ms=np.std(s_itls or 0) * 1000,
median_s_itl_ms=np.median(s_itls or 0) * 1000,
percentiles_s_itl_ms=[(p, np.percentile(s_itls or 0, p) * 1000) for p in selected_percentiles],
percentiles_s_itl_ms=[(p, np.percentile(s_itls or 0, p) * 1000)
for p in selected_percentiles],
mean_e2el_ms=np.mean(e2els or 0) * 1000,
std_e2el_ms=np.std(e2els or 0) * 1000,
median_e2el_ms=np.median(e2els or 0) * 1000,
percentiles_e2el_ms=[(p, np.percentile(e2els or 0, p) * 1000) for p in selected_percentiles],
percentiles_e2el_ms=[(p, np.percentile(e2els or 0, p) * 1000)
for p in selected_percentiles],
mean_s_e2el_ms=np.mean(s_e2els or 0) * 1000,
std_s_e2el_ms=np.std(s_e2els or 0) * 1000,
median_s_e2el_ms=np.median(s_e2els or 0) * 1000,
percentiles_s_e2el_ms=[(p, np.percentile(s_e2els or 0, p) * 1000) for p in selected_percentiles],
percentiles_s_e2el_ms=[(p, np.percentile(s_e2els or 0, p) * 1000)
for p in selected_percentiles],
mean_input_len=np.mean(input_lens or 0) * 1,
std_input_len=np.std(input_lens or 0) * 1,
median_input_len=np.median(input_lens or 0) * 1,
percentiles_input_len=[(p, np.percentile(input_lens or 0, p)) for p in selected_percentiles],
percentiles_input_len=[(p, np.percentile(input_lens or 0, p))
for p in selected_percentiles],
mean_s_input_len=np.mean(infer_input_lens or 0) * 1,
std_s_input_len=np.std(infer_input_lens or 0) * 1,
median_s_input_len=np.median(infer_input_lens or 0) * 1,
percentiles_s_input_len=[(p, np.percentile(infer_input_lens or 0, p)) for p in selected_percentiles],
percentiles_s_input_len=[(p, np.percentile(infer_input_lens or 0, p))
for p in selected_percentiles],
mean_output_len=np.mean(actual_output_lens or 0) * 1,
std_output_len=np.std(actual_output_lens or 0) * 1,
median_output_len=np.median(actual_output_lens or 0) * 1,
percentiles_output_len=[(p, np.percentile(actual_output_lens or 0, p)) for p in selected_percentiles],
percentiles_output_len=[(p, np.percentile(actual_output_lens or 0, p))
for p in selected_percentiles],
)
return metrics, actual_output_lens
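Every percentiles_* field stores (percentile, value) pairs for the percentiles chosen with --metric-percentiles, with latency metrics scaled to milliseconds. For instance (values invented):

    import numpy as np

    selected_percentiles = [50.0, 90.0, 99.0]   # from --metric-percentiles
    ttfts = [0.08, 0.12, 0.10, 0.30, 0.11]      # seconds, invented

    percentiles_ttft_ms = [(p, np.percentile(ttfts or 0, p) * 1000)
                           for p in selected_percentiles]
    print(percentiles_ttft_ms)
    # approximately [(50.0, 110.0), (90.0, 228.0), (99.0, 292.8)]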
@@ -334,22 +351,20 @@ async def benchmark(
if lora_modules:
# For each input request, choose a LoRA module at random.
lora_modules = iter([random.choice(lora_modules) for _ in range(len(input_requests))])
lora_modules = iter(
[random.choice(lora_modules) \
for _ in range(len(input_requests))])
if profile:
print("Starting profiler...")
test_prompt = None
test_output_len = None
profile_input = RequestFuncInput(
model=model_id,
model_name=model_name,
prompt=test_prompt,
api_url=base_url + "/start_profile",
output_len=test_output_len,
logprobs=logprobs,
ignore_eos=ignore_eos,
extra_body=extra_body,
)
profile_input = RequestFuncInput(model=model_id,
model_name=model_name,
prompt=test_prompt,
api_url=base_url + "/start_profile",
output_len=test_output_len,
logprobs=logprobs,
ignore_eos=ignore_eos,
extra_body=extra_body)
profile_output = await request_func(request_func_input=profile_input)
if profile_output.success:
print("Profiler started")
@@ -369,16 +384,19 @@ async def benchmark(
# and it will simplify the code in limited_request_func.
# semaphore = (asyncio.Semaphore(max_concurrency)
# if max_concurrency else contextlib.nullcontext())
semaphore = asyncio.Semaphore(max_concurrency) if max_concurrency else None
semaphore = (asyncio.Semaphore(max_concurrency)
if max_concurrency else None)
async def limited_request_func(request_func_input, pbar):
if semaphore is None:
return await request_func(request_func_input=request_func_input, pbar=pbar)
return await request_func(request_func_input=request_func_input,
pbar=pbar)
async with semaphore:
return await request_func(request_func_input=request_func_input, pbar=pbar)
return await request_func(request_func_input=request_func_input,
pbar=pbar)
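The --max-concurrency limit is enforced with an optional asyncio.Semaphore wrapped around each request coroutine, and skipped entirely when no limit is set. The same pattern in a runnable miniature:

    import asyncio

    async def fake_request(i):
        await asyncio.sleep(0.01)
        return i

    async def run(max_concurrency=None):
        semaphore = asyncio.Semaphore(max_concurrency) if max_concurrency else None

        async def limited(i):
            if semaphore is None:
                return await fake_request(i)
            async with semaphore:          # at most max_concurrency in flight
                return await fake_request(i)

        return await asyncio.gather(*(limited(i) for i in range(8)))

    print(asyncio.run(run(max_concurrency=2)))  # [0, 1, 2, 3, 4, 5, 6, 7]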
benchmark_start_time = time.perf_counter()
print(f"开始时间:{datetime.now()}")
tasks: list[asyncio.Task] = []
async for request in get_request(input_requests, request_rate, burstiness):
@@ -391,26 +409,25 @@ async def benchmark(
req_lora_module = next(lora_modules)
req_model_id, req_model_name = req_lora_module, req_lora_module
request_func_input = RequestFuncInput(
model=req_model_id,
model_name=req_model_name,
prompt=prompt,
prompt_len=0,
history_QA=history_QA,
hyper_parameters=hyper_parameters,
api_url=api_url,
output_len=output_len,
logprobs=logprobs,
ignore_eos=ignore_eos,
extra_body=extra_body,
)
tasks.append(asyncio.create_task(limited_request_func(request_func_input=request_func_input, pbar=pbar)))
request_func_input = RequestFuncInput(model=req_model_id,
model_name=req_model_name,
prompt=prompt,
prompt_len=0,
history_QA=history_QA,
hyper_parameters=hyper_parameters,
api_url=api_url,
output_len=output_len,
logprobs=logprobs,
ignore_eos=ignore_eos,
extra_body=extra_body)
tasks.append(
asyncio.create_task(
limited_request_func(request_func_input=request_func_input,
pbar=pbar)))
outputs: list[RequestFuncOutput] = await asyncio.gather(*tasks)
print(f"完成时间:{datetime.now()}")
if profile:
print("Stopping profiler...")
test_output_len = None
test_output_len = None
profile_input = RequestFuncInput(
model=model_id,
prompt=test_prompt,
@@ -437,16 +454,22 @@ async def benchmark(
)
print("Benchmark complete!!!")
print("{s:{c}^{n}}".format(s=" Serving Benchmark Result ", n=50, c="="))
print("{s:{c}^{n}}".format(s=' Serving Benchmark Result ', n=50, c='='))
print("{:<40} {:<10}".format("Successful requests:", metrics.completed))
print("{:<40} {:<10.2f}".format("Benchmark duration (s):", benchmark_duration))
print("{:<40} {:<10.2f}".format("Benchmark duration (s):",
benchmark_duration))
print("{:<40} {:<10}".format("Total input tokens:", metrics.total_input))
print("{:<40} {:<10}".format("Total generated tokens:", metrics.total_output))
print("{:<40} {:<10.3f}".format("Request throughput (req/s):", metrics.request_throughput))
print("{:<40} {:<10}".format("Total generated tokens:",
metrics.total_output))
print("{:<40} {:<10.3f}".format("Request throughput (req/s):",
metrics.request_throughput))
if goodput_config_dict:
print("{:<40} {:<10.2f}".format("Request goodput (req/s):", metrics.request_goodput))
print("{:<40} {:<10.2f}".format("Output token throughput (tok/s):", metrics.output_throughput))
print("{:<40} {:<10.2f}".format("Total Token throughput (tok/s):", metrics.total_token_throughput))
print("{:<40} {:<10.2f}".format("Request goodput (req/s):",
metrics.request_goodput))
print("{:<40} {:<10.2f}".format("Output token throughput (tok/s):",
metrics.output_throughput))
print("{:<40} {:<10.2f}".format("Total Token throughput (tok/s):",
metrics.total_token_throughput))
result = {
"duration": benchmark_duration,
@@ -454,7 +477,8 @@ async def benchmark(
"total_input_tokens": metrics.total_input,
"total_output_tokens": metrics.total_output,
"request_throughput": metrics.request_throughput,
"request_goodput:": (metrics.request_goodput if goodput_config_dict else None),
"request_goodput:":
metrics.request_goodput if goodput_config_dict else None,
"output_throughput": metrics.output_throughput,
"total_token_throughput": metrics.total_token_throughput,
"input_lens": [output.prompt_len for output in outputs],
@@ -467,6 +491,7 @@ async def benchmark(
"reasoning_contents": [output.reasoning_content for output in outputs],
"errors": [output.error for output in outputs],
}
quick_result = copy.deepcopy(result)
def process_one_metric(
# E.g., "ttft"
@@ -480,25 +505,24 @@ async def benchmark(
# metric.
if metric_attribute_name not in selected_percentile_metrics:
return
print("{s:{c}^{n}}".format(s=metric_header, n=50, c="-"))
print(
"{:<40} {:<10.2f}".format(
f"Mean {metric_name} (ms):",
getattr(metrics, f"mean_{metric_attribute_name}_ms"),
)
)
print(
"{:<40} {:<10.2f}".format(
f"Median {metric_name} (ms):",
getattr(metrics, f"median_{metric_attribute_name}_ms"),
)
)
result[f"mean_{metric_attribute_name}_ms"] = getattr(metrics, f"mean_{metric_attribute_name}_ms")
result[f"median_{metric_attribute_name}_ms"] = getattr(metrics, f"median_{metric_attribute_name}_ms")
result[f"std_{metric_attribute_name}_ms"] = getattr(metrics, f"std_{metric_attribute_name}_ms")
for p, value in getattr(metrics, f"percentiles_{metric_attribute_name}_ms"):
print("{s:{c}^{n}}".format(s=metric_header, n=50, c='-'))
print("{:<40} {:<10.2f}".format(
f"Mean {metric_name} (ms):",
getattr(metrics, f"mean_{metric_attribute_name}_ms")))
print("{:<40} {:<10.2f}".format(
f"Median {metric_name} (ms):",
getattr(metrics, f"median_{metric_attribute_name}_ms")))
result[f"mean_{metric_attribute_name}_ms"] = getattr(
metrics, f"mean_{metric_attribute_name}_ms")
result[f"median_{metric_attribute_name}_ms"] = getattr(
metrics, f"median_{metric_attribute_name}_ms")
result[f"std_{metric_attribute_name}_ms"] = getattr(
metrics, f"std_{metric_attribute_name}_ms")
for p, value in getattr(metrics,
f"percentiles_{metric_attribute_name}_ms"):
p_word = str(int(p)) if int(p) == p else str(p)
print("{:<40} {:<10.2f}".format(f"P{p_word} {metric_name} (ms):", value))
print("{:<40} {:<10.2f}".format(f"P{p_word} {metric_name} (ms):",
value))
result[f"p{p_word}_{metric_attribute_name}_ms"] = value
def process_one_length(
@@ -513,31 +537,31 @@ async def benchmark(
# metric.
if metric_attribute_name not in selected_percentile_metrics:
return
print("{s:{c}^{n}}".format(s=metric_header, n=50, c="-"))
print(
"{:<40} {:<10.2f}".format(
f"Mean {metric_name}:",
getattr(metrics, f"mean_{metric_attribute_name}"),
)
)
print(
"{:<40} {:<10.2f}".format(
f"Median {metric_name}:",
getattr(metrics, f"median_{metric_attribute_name}"),
)
)
result[f"mean_{metric_attribute_name}"] = getattr(metrics, f"mean_{metric_attribute_name}")
result[f"median_{metric_attribute_name}"] = getattr(metrics, f"median_{metric_attribute_name}")
result[f"std_{metric_attribute_name}"] = getattr(metrics, f"std_{metric_attribute_name}")
for p, value in getattr(metrics, f"percentiles_{metric_attribute_name}"):
print("{s:{c}^{n}}".format(s=metric_header, n=50, c='-'))
print("{:<40} {:<10.2f}".format(
f"Mean {metric_name}:",
getattr(metrics, f"mean_{metric_attribute_name}")))
print("{:<40} {:<10.2f}".format(
f"Median {metric_name}:",
getattr(metrics, f"median_{metric_attribute_name}")))
result[f"mean_{metric_attribute_name}"] = getattr(
metrics, f"mean_{metric_attribute_name}")
result[f"median_{metric_attribute_name}"] = getattr(
metrics, f"median_{metric_attribute_name}")
result[f"std_{metric_attribute_name}"] = getattr(
metrics, f"std_{metric_attribute_name}")
for p, value in getattr(metrics,
f"percentiles_{metric_attribute_name}"):
p_word = str(int(p)) if int(p) == p else str(p)
print("{:<40} {:<10.2f}".format(f"P{p_word} {metric_name}:", value))
print("{:<40} {:<10.2f}".format(f"P{p_word} {metric_name}:",
value))
result[f"p{p_word}_{metric_attribute_name}"] = value
process_one_length("s_decode", "Decode", "解码速度(tok/s)")
process_one_metric("ttft", "TTFT", "Time to First Token")
process_one_metric("s_ttft", "S_TTFT", "Infer Time to First Token")
process_one_metric("tpot", "TPOT", "Time per Output Token (excl. 1st token)")
process_one_metric("tpot", "TPOT",
"Time per Output Token (excl. 1st token)")
process_one_metric("itl", "ITL", "Inter-token Latency")
process_one_metric("s_itl", "S_ITL", "Infer Inter-token Latency")
process_one_metric("e2el", "E2EL", "End-to-end Latency")
@@ -557,7 +581,6 @@ def quick_summary(quick_result, selected_percentile_metrics, metrics):
"""
Quick evaluation summary
"""
def process_quick_metric(
metric_attribute_name: str,
metric_name: str,
@@ -565,7 +588,7 @@ def quick_summary(quick_result, selected_percentile_metrics, metrics):
):
if metric_attribute_name not in selected_percentile_metrics:
return
print("{s:{c}^{n}}".format(s=metric_header, n=50, c="-"))
print("{s:{c}^{n}}".format(s=metric_header, n=50, c='-'))
mean_value = getattr(metrics, f"mean_{metric_attribute_name}_ms")
print("{:<40} {:<10.2f}".format(f"Mean {metric_name} (ms):", mean_value))
quick_result[f"mean_{metric_attribute_name}_ms"] = mean_value
@@ -577,17 +600,17 @@ def quick_summary(quick_result, selected_percentile_metrics, metrics):
):
if metric_attribute_name not in selected_percentile_metrics:
return
print("{s:{c}^{n}}".format(s=metric_header, n=50, c="-"))
print("{s:{c}^{n}}".format(s=metric_header, n=50, c='-'))
mean_value = getattr(metrics, f"mean_{metric_attribute_name}")
print("{:<40} {:<10.2f}".format(f"Mean {metric_name}:", mean_value))
quick_result[f"mean_{metric_attribute_name}"] = mean_value
print("\n\n\n")
print("{s:{c}^{n}}".format(s=" Benchmark Quick Summary ", n=50, c="="))
print("{s:{c}^{n}}".format(s=' Benchmark Quick Summary ', n=50, c='='))
process_quick_length("s_decode", "Decode", "Decode speed (tok/s)")
process_quick_metric("ttft", "TTFT", "Time to First Token")
process_quick_metric("s_ttft", "S_TTFT", "Infer Time to First Token")
process_quick_metric("tpot", "TPOT", "Time per Output Token (excl. 1st token)")
process_quick_metric("tpot", "TPOT",
"Time per Output Token (excl. 1st token)")
process_quick_metric("itl", "ITL", "Inter-token Latency")
process_quick_metric("s_itl", "S_ITL", "Infer Inter-token Latency")
process_quick_metric("e2el", "E2EL", "End-to-end Latency")
@@ -610,14 +633,12 @@ def check_goodput_args(args):
raise ValueError(
f"Invalid metric name found, {slo_name}: {slo_val}. "
"The service level objective name should be one of "
f"{VALID_NAMES!s}. "
)
f"{str(VALID_NAMES)}. ")
if slo_val < 0:
raise ValueError(
f"Invalid value found, {slo_name}: {slo_val}. "
"The service level objective value should be "
"non-negative."
)
"non-negative.")
return goodput_config_dict
@@ -631,43 +652,37 @@ def parse_goodput(slo_pairs):
except ValueError as err:
raise argparse.ArgumentTypeError(
"Invalid format found for service level objectives. "
'Specify service level objectives for goodput as "KEY:VALUE" '
"Specify service level objectives for goodput as \"KEY:VALUE\" "
"pairs, where the key is a metric name, and the value is a "
"number in milliseconds."
) from err
"number in milliseconds.") from err
return goodput_config_dict
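parse_goodput turns each "KEY:VALUE" argument into a metric-name-to-milliseconds mapping; check_goodput_args then validates the names. A simplified sketch with the error handling omitted:

    def parse_goodput_simplified(slo_pairs):
        goodput_config_dict = {}
        for slo_pair in slo_pairs:
            slo_name, slo_val = slo_pair.split(":")
            goodput_config_dict[slo_name] = float(slo_val)
        return goodput_config_dict

    # --goodput ttft:300 tpot:50 e2el:5000
    print(parse_goodput_simplified(["ttft:300", "tpot:50", "e2el:5000"]))
    # {'ttft': 300.0, 'tpot': 50.0, 'e2el': 5000.0}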
def save_to_pytorch_benchmark_format(args: argparse.Namespace, results: dict[str, Any], file_name: str) -> None:
def save_to_pytorch_benchmark_format(args: argparse.Namespace,
results: dict[str, Any],
file_name: str) -> None:
"""Save the benchmarking results to PyTorch Benchmark Format JSON file"""
metrics = [
"median_ttft_ms",
"mean_ttft_ms",
"std_ttft_ms",
"p99_ttft_ms",
"mean_tpot_ms",
"median_tpot_ms",
"std_tpot_ms",
"p99_tpot_ms",
"median_itl_ms",
"mean_itl_ms",
"std_itl_ms",
"p99_itl_ms",
"median_ttft_ms", "mean_ttft_ms", "std_ttft_ms", "p99_ttft_ms",
"mean_tpot_ms", "median_tpot_ms", "std_tpot_ms", "p99_tpot_ms",
"median_itl_ms", "mean_itl_ms", "std_itl_ms", "p99_itl_ms"
]
# These raw data might be useful, but they are rather big. They can be added
# later if needed
ignored_metrics = ["ttfts", "itls", "generated_texts", "errors"]
pt_records = convert_to_pytorch_benchmark_format(
args=args,
metrics={k: [results[k]] for k in metrics},
extra_info={k: results[k] for k in results if k not in metrics and k not in ignored_metrics},
)
metrics={k: [results[k]]
for k in metrics},
extra_info={
k: results[k]
for k in results if k not in metrics and k not in ignored_metrics
})
if pt_records:
# Don't use json suffix here as we don't want CI to pick it up
pt_file = f"{os.path.splitext(file_name)[0]}.pytorch.json"
write_to_json(pt_file, pt_records)
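The converter receives one list of values per selected metric plus an extra_info dict built from everything else in the results that is not an ignored bulk field. A small illustration of how those two dicts are assembled (values invented):

    results = {"median_ttft_ms": 118.2, "mean_ttft_ms": 130.4,
               "std_ttft_ms": 22.1, "p99_ttft_ms": 201.0,
               "request_throughput": 7.3,
               "generated_texts": ["..."]}               # invented values

    metrics = ["median_ttft_ms", "mean_ttft_ms", "std_ttft_ms", "p99_ttft_ms"]
    ignored_metrics = ["ttfts", "itls", "generated_texts", "errors"]

    metric_values = {k: [results[k]] for k in metrics}
    extra_info = {k: results[k] for k in results
                  if k not in metrics and k not in ignored_metrics}
    print(metric_values["median_ttft_ms"])  # [118.2], one value list per metric
    print(extra_info)                        # {'request_throughput': 7.3}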
def check_health(api_base_url: str) -> bool:
health_url = api_base_url.rstrip("/") + "/health"
try:
@@ -682,7 +697,6 @@ def check_health(api_base_url: str) -> bool:
print(f"[HEALTH] Failed to connect to {health_url}: {e}")
return False
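Only the tail of check_health is visible in this hunk; a plausible reconstruction (an assumption, not the exact implementation) simply polls the server's /health endpoint once:

    import requests

    def check_health_sketch(api_base_url: str) -> bool:
        health_url = api_base_url.rstrip("/") + "/health"
        try:
            resp = requests.get(health_url, timeout=5)
            return resp.status_code == 200
        except requests.RequestException as e:
            print(f"[HEALTH] Failed to connect to {health_url}: {e}")
            return False

    # check_health_sketch("http://127.0.0.1:8000")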
def main(args: argparse.Namespace):
"""Main entry point"""
print(args)
@@ -693,6 +707,7 @@ def main(args: argparse.Namespace):
model_id = args.model
model_name = args.served_model_name
tokenizer_id = args.tokenizer if args.tokenizer is not None else args.model
tokenizer_mode = args.tokenizer_mode
if args.base_url is not None:
api_url = f"{args.base_url}{args.endpoint}"
@@ -702,17 +717,23 @@ def main(args: argparse.Namespace):
base_url = f"http://{args.host}:{args.port}"
if args.dataset_name is None:
raise ValueError("Please specify '--dataset-name' and the corresponding " "'--dataset-path' if required.")
raise ValueError(
"Please specify '--dataset-name' and the corresponding "
"'--dataset-path' if required.")
# For datasets that follow a similar structure, use a mapping.
dataset_mapping = {
"EB": lambda: EBDataset(random_seed=args.seed, dataset_path=args.dataset_path).sample(
num_requests=args.num_prompts,
output_len=args.sharegpt_output_len,
"EB":
lambda: EBDataset(random_seed=args.seed,
dataset_path=args.dataset_path).sample(
num_requests=args.num_prompts,
output_len=args.sharegpt_output_len,
),
"EBChat": lambda: EBChatDataset(random_seed=args.seed, dataset_path=args.dataset_path).sample(
num_requests=args.num_prompts,
output_len=args.sharegpt_output_len,
"EBChat":
lambda: EBChatDataset(random_seed=args.seed,
dataset_path=args.dataset_path).sample(
num_requests=args.num_prompts,
output_len=args.sharegpt_output_len,
),
}
@@ -730,14 +751,15 @@ def main(args: argparse.Namespace):
"top_p": args.top_p,
"top_k": args.top_k,
"min_p": args.min_p,
"temperature": args.temperature,
}.items()
if v is not None
"temperature": args.temperature
}.items() if v is not None
}
# Sampling parameters are only supported by openai-compatible backend.
if sampling_params and args.backend not in OPENAI_COMPATIBLE_BACKENDS:
raise ValueError("Sampling parameters are only supported by openai-compatible " "backends.")
raise ValueError(
"Sampling parameters are only supported by openai-compatible "
"backends.")
if "temperature" not in sampling_params:
sampling_params["temperature"] = 0.0 # Default to greedy decoding.
@@ -768,14 +790,15 @@ def main(args: argparse.Namespace):
disable_tqdm=args.disable_tqdm,
profile=args.profile,
selected_percentile_metrics=args.percentile_metrics.split(","),
selected_percentiles=[float(p) for p in args.metric_percentiles.split(",")],
selected_percentiles=[
float(p) for p in args.metric_percentiles.split(",")
],
ignore_eos=args.ignore_eos,
goodput_config_dict=goodput_config_dict,
max_concurrency=args.max_concurrency,
lora_modules=args.lora_modules,
extra_body=sampling_params,
)
)
))
# Save config and results to json
if args.save_result:
@@ -796,23 +819,22 @@ def main(args: argparse.Namespace):
kvstring = item.split("=")
result_json[kvstring[0].strip()] = kvstring[1].strip()
else:
raise ValueError("Invalid metadata format. Please use KEY=VALUE format.")
raise ValueError(
"Invalid metadata format. Please use KEY=VALUE format."
)
if not args.save_detailed:
# Remove fields with too many data points
for field in [
"input_lens",
"output_lens",
"ttfts",
"itls",
"generated_texts",
"errors",
"input_lens", "output_lens", "ttfts", "itls",
"generated_texts", "errors"
]:
if field in result_json:
del result_json[field]
# Traffic
result_json["request_rate"] = args.request_rate if args.request_rate < float("inf") else "inf"
result_json["request_rate"] = (args.request_rate if args.request_rate
< float("inf") else "inf")
result_json["burstiness"] = args.burstiness
result_json["max_concurrency"] = args.max_concurrency
@@ -821,19 +843,21 @@ def main(args: argparse.Namespace):
# Save to file
base_model_id = model_id.split("/")[-1]
max_concurrency_str = f"-concurrency{args.max_concurrency}" if args.max_concurrency is not None else ""
file_name = f"{backend}-{args.request_rate}qps{max_concurrency_str}-{base_model_id}-{current_dt}.json"
max_concurrency_str = (f"-concurrency{args.max_concurrency}"
if args.max_concurrency is not None else "")
file_name = f"{backend}-{args.request_rate}qps{max_concurrency_str}-{base_model_id}-{current_dt}.json" #noqa
if args.result_filename:
file_name = args.result_filename
if args.result_dir:
file_name = os.path.join(args.result_dir, file_name)
with open(file_name, "w", encoding="utf-8") as outfile:
with open(file_name, "w", encoding='utf-8') as outfile:
json.dump(result_json, outfile)
save_to_pytorch_benchmark_format(args, result_json, file_name)
if __name__ == "__main__":
parser = FlexibleArgumentParser(description="Benchmark the online serving throughput.")
parser = FlexibleArgumentParser(
description="Benchmark the online serving throughput.")
parser.add_argument(
"--backend",
type=str,
@@ -859,29 +883,18 @@ if __name__ == "__main__":
"--dataset-name",
type=str,
default="sharegpt",
choices=[
"sharegpt",
"burstgpt",
"sonnet",
"random",
"hf",
"EB",
"EBChat",
],
choices=["sharegpt", "burstgpt", "sonnet", "random", "hf", "EB", "EBChat"],
help="Name of the dataset to benchmark on.",
)
parser.add_argument(
"--dataset-path",
type=str,
default=None,
help="Path to the sharegpt/sonnet dataset. " "Or the huggingface dataset ID if using HF dataset.",
)
parser.add_argument(
"--hyperparameter-path",
type=str,
default=None,
help="Path to the hyperparameter. ",
)
parser.add_argument("--dataset-path",
type=str,
default=None,
help="Path to the sharegpt/sonnet dataset. "
"Or the huggingface dataset ID if using HF dataset.")
parser.add_argument("--hyperparameter-path",
type=str,
default=None,
help="Path to the hyperparameter. ")
parser.add_argument(
"--max-concurrency",
type=int,
@@ -893,8 +906,7 @@ if __name__ == "__main__":
"initiated, this argument will control how many are actually allowed "
"to execute at a time. This means that when used in combination, the "
"actual request rate may be lower than specified with --request-rate, "
"if the server is not processing requests fast enough to keep up.",
)
"if the server is not processing requests fast enough to keep up.")
parser.add_argument(
"--model",
@@ -905,7 +917,7 @@ if __name__ == "__main__":
parser.add_argument(
"--tokenizer",
type=str,
help="Name or path of the tokenizer, if not using the default tokenizer.",
help="Name or path of the tokenizer, if not using the default tokenizer.", # noqa: E501
)
parser.add_argument("--use-beam-search", action="store_true")
parser.add_argument(
@@ -918,13 +930,11 @@ if __name__ == "__main__":
"--logprobs",
type=int,
default=None,
help=(
"Number of logprobs-per-token to compute & return as part of "
"the request. If unspecified, then either (1) if beam search "
"is disabled, no logprobs are computed & a single dummy "
"logprob is returned for each token; or (2) if beam search "
"is enabled 1 logprob per token is computed"
),
help=("Number of logprobs-per-token to compute & return as part of "
"the request. If unspecified, then either (1) if beam search "
"is disabled, no logprobs are computed & a single dummy "
"logprob is returned for each token; or (2) if beam search "
"is enabled 1 logprob per token is computed"),
)
parser.add_argument(
"--request-rate",
@@ -961,7 +971,8 @@ if __name__ == "__main__":
parser.add_argument(
"--profile",
action="store_true",
help="Use Torch Profiler. The endpoint must be launched with " "VLLM_TORCH_PROFILER_DIR to enable profiler.",
help="Use Torch Profiler. The endpoint must be launched with "
"VLLM_TORCH_PROFILER_DIR to enable profiler.",
)
parser.add_argument(
"--save-result",
@@ -1002,38 +1013,35 @@ if __name__ == "__main__":
"--ignore-eos",
action="store_true",
help="Set ignore_eos flag when sending the benchmark request."
"Warning: ignore_eos is not supported in deepspeed_mii and tgi.",
)
"Warning: ignore_eos is not supported in deepspeed_mii and tgi.")
parser.add_argument(
"--percentile-metrics",
type=str,
default="ttft,tpot,itl",
help="Comma-separated list of selected metrics to report percentils. "
"This argument specifies the metrics to report percentiles. "
'Allowed metric names are "ttft", "tpot", "itl", "e2el". '
'Default value is "ttft,tpot,itl".',
)
"Allowed metric names are \"ttft\", \"tpot\", \"itl\", \"e2el\". "
"Default value is \"ttft,tpot,itl\".")
parser.add_argument(
"--metric-percentiles",
type=str,
default="99",
help="Comma-separated list of percentiles for selected metrics. "
'To report 25-th, 50-th, and 75-th percentiles, use "25,50,75". '
'Default value is "99". '
'Use "--percentile-metrics" to select metrics.',
"To report 25-th, 50-th, and 75-th percentiles, use \"25,50,75\". "
"Default value is \"99\". "
"Use \"--percentile-metrics\" to select metrics.",
)
parser.add_argument(
"--goodput",
nargs="+",
required=False,
help='Specify service level objectives for goodput as "KEY:VALUE" '
help="Specify service level objectives for goodput as \"KEY:VALUE\" "
"pairs, where the key is a metric name, and the value is in "
'milliseconds. Multiple "KEY:VALUE" pairs can be provided, '
"milliseconds. Multiple \"KEY:VALUE\" pairs can be provided, "
"separated by spaces. Allowed request level metric names are "
'"ttft", "tpot", "e2el". For more context on the definition of '
"\"ttft\", \"tpot\", \"e2el\". For more context on the definition of "
"goodput, refer to DistServe paper: https://arxiv.org/pdf/2401.09670 "
"and the blog: https://hao-ai-lab.github.io/blogs/distserve",
)
"and the blog: https://hao-ai-lab.github.io/blogs/distserve")
# group for dataset specific arguments
sonnet_group = parser.add_argument_group("sonnet dataset options")
@@ -1061,8 +1069,8 @@ if __name__ == "__main__":
"--sharegpt-output-len",
type=int,
default=None,
help="Output length for each request. Overrides the output length " "from the ShareGPT dataset.",
)
help="Output length for each request. Overrides the output length "
"from the ShareGPT dataset.")
random_group = parser.add_argument_group("random dataset options")
random_group.add_argument(
@@ -1090,24 +1098,29 @@ if __name__ == "__main__":
"--random-prefix-len",
type=int,
default=0,
help=(
"Number of fixed prefix tokens before the random context "
"in a request. "
"The total input length is the sum of `random-prefix-len` and "
"a random "
"context length sampled from [input_len * (1 - range_ratio), "
"input_len * (1 + range_ratio)]."
),
help=("Number of fixed prefix tokens before the random context "
"in a request. "
"The total input length is the sum of `random-prefix-len` and "
"a random "
"context length sampled from [input_len * (1 - range_ratio), "
"input_len * (1 + range_ratio)]."),
)
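Per this help text, the total input length is random-prefix-len plus a context length sampled from [input_len * (1 - range_ratio), input_len * (1 + range_ratio)]. A quick sketch of that arithmetic; the flag names referenced in the comments are assumptions, not taken from this diff:

    import random

    random_prefix_len = 0    # --random-prefix-len
    input_len = 1024         # assumed to come from the random input length flag
    range_ratio = 0.25       # assumed to come from the random range ratio flag

    low = int(input_len * (1 - range_ratio))
    high = int(input_len * (1 + range_ratio))
    context_len = random.randint(low, high)
    total_input_len = random_prefix_len + context_len
    print(low, high, total_input_len)  # 768 1280 <value in [768, 1280]>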
hf_group = parser.add_argument_group("hf dataset options")
hf_group.add_argument("--hf-subset", type=str, default=None, help="Subset of the HF dataset.")
hf_group.add_argument("--hf-split", type=str, default=None, help="Split of the HF dataset.")
hf_group.add_argument("--hf-subset",
type=str,
default=None,
help="Subset of the HF dataset.")
hf_group.add_argument("--hf-split",
type=str,
default=None,
help="Split of the HF dataset.")
hf_group.add_argument(
"--hf-output-len",
type=int,
default=None,
help="Output length for each request. Overrides the output lengths " "from the sampled HF dataset.",
help="Output length for each request. Overrides the output lengths "
"from the sampled HF dataset.",
)
sampling_group = parser.add_argument_group("sampling parameters")
@@ -1115,58 +1128,52 @@ if __name__ == "__main__":
"--top-p",
type=float,
default=None,
help="Top-p sampling parameter. Only has effect on openai-compatible " "backends.",
)
help="Top-p sampling parameter. Only has effect on openai-compatible "
"backends.")
sampling_group.add_argument(
"--top-k",
type=int,
default=None,
help="Top-k sampling parameter. Only has effect on openai-compatible " "backends.",
)
help="Top-k sampling parameter. Only has effect on openai-compatible "
"backends.")
sampling_group.add_argument(
"--min-p",
type=float,
default=None,
help="Min-p sampling parameter. Only has effect on openai-compatible " "backends.",
)
help="Min-p sampling parameter. Only has effect on openai-compatible "
"backends.")
sampling_group.add_argument(
"--temperature",
type=float,
default=None,
help="Temperature sampling parameter. Only has effect on "
"openai-compatible backends. If not specified, default to greedy "
"decoding (i.e. temperature==0.0).",
)
"decoding (i.e. temperature==0.0).")
parser.add_argument(
"--tokenizer-mode",
'--tokenizer-mode',
type=str,
default="auto",
choices=["auto", "slow", "mistral", "custom"],
choices=['auto', 'slow', 'mistral', 'custom'],
help='The tokenizer mode.\n\n* "auto" will use the '
'fast tokenizer if available.\n* "slow" will '
"always use the slow tokenizer. \n* "
'always use the slow tokenizer. \n* '
'"mistral" will always use the `mistral_common` tokenizer. \n*'
'"custom" will use --tokenizer to select the preregistered tokenizer.',
)
'"custom" will use --tokenizer to select the preregistered tokenizer.')
parser.add_argument(
"--served-model-name",
type=str,
default=None,
help="The model name used in the API. "
"If not specified, the model name will be the "
"same as the ``--model`` argument. ",
)
parser.add_argument("--served-model-name",
type=str,
default=None,
help="The model name used in the API. "
"If not specified, the model name will be the "
"same as the ``--model`` argument. ")
parser.add_argument(
"--lora-modules",
nargs="+",
default=None,
help="A subset of LoRA module names passed in when "
"launching the server. For each request, the "
"script chooses a LoRA module at random.",
)
parser.add_argument("--lora-modules",
nargs='+',
default=None,
help="A subset of LoRA module names passed in when "
"launching the server. For each request, the "
"script chooses a LoRA module at random.")
args = parser.parse_args()

View File

@@ -7,4 +7,4 @@ tensor_parallel_size: 1
enable_chunked_prefill: True
max_num_batched_tokens: 384
quantization: wint4
reasoning_parser: ernie-45-vl
reasoning_parser: ernie-45-vl

View File

@@ -12,4 +12,4 @@ rdma_comm_ports: "7671,7672,7673,7674"
pd_comm_port: "2334"
max_num_batched_tokens: 384
max_num_partial_prefills: 3
max_long_partial_prefills: 3
max_long_partial_prefills: 3

View File

@@ -9,4 +9,4 @@ cache_queue_port: 55664
engine_worker_queue_port: 6677
cache_transfer_protocol: "rdma,ipc"
rdma_comm_ports: "7675,7676,7677,7678"
pd_comm_port: "2333"
pd_comm_port: "2333"

View File

@@ -3,4 +3,3 @@ max_num_seqs: 96
gpu_memory_utilization: 0.9
kv_cache_ratio: 0.71
tensor_parallel_size: 4
quantization: wint4

View File

@@ -10,4 +10,4 @@ engine_worker_queue_port: 6677
num_gpu_blocks_override: 1024
cache_transfer_protocol: "rdma"
rdma_comm_ports: "7671,7672,7673,7674,7675,7676,7677,7678"
pd_comm_port: "2334"
pd_comm_port: "2334"

View File

@@ -10,4 +10,4 @@ splitwise_role: decode
engine_worker_queue_port: 6678
cache_transfer_protocol: "rdma,ipc"
rdma_comm_ports: "7671,7672,7673,7674"
pd_comm_port: "2334"
pd_comm_port: "2334"

View File

@@ -9,4 +9,4 @@ cache_queue_port: 55664
engine_worker_queue_port: 6677
cache_transfer_protocol: "rdma,ipc"
rdma_comm_ports: "7675,7676,7677,7678"
pd_comm_port: "2333"
pd_comm_port: "2333"

View File

@@ -12,4 +12,4 @@ rdma_comm_ports: "7671,7672,7673,7674"
pd_comm_port: "2334"
max_num_batched_tokens: 384
max_num_partial_prefills: 3
max_long_partial_prefills: 3
max_long_partial_prefills: 3

View File

@@ -9,4 +9,4 @@ cache_queue_port: 55664
engine_worker_queue_port: 6677
cache_transfer_protocol: "rdma,ipc"
rdma_comm_ports: "7675,7676,7677,7678"
pd_comm_port: "2333"
pd_comm_port: "2333"

View File

@@ -3,4 +3,3 @@ max_num_seqs: 96
gpu_memory_utilization: 0.9
kv_cache_ratio: 0.71
tensor_parallel_size: 8
quantization: wint8

View File

@@ -2,5 +2,4 @@ max_model_len: 32768
max_num_seqs: 128
kv_cache_ratio: 0.75
tensor_parallel_size: 1
graph_optimization_config:
graph_opt_level: 1
enable_static_graph_inference: True

View File

@@ -2,5 +2,4 @@ max_model_len: 32768
max_num_seqs: 128
kv_cache_ratio: 0.75
tensor_parallel_size: 1
graph_optimization_config:
graph_opt_level: 1
enable_static_graph_inference: True

View File

@@ -3,5 +3,4 @@ max_num_seqs: 128
kv_cache_ratio: 0.75
tensor_parallel_size: 1
quantization: wint8
graph_optimization_config:
graph_opt_level: 1
enable_static_graph_inference: True

View File

@@ -3,5 +3,4 @@ max_num_seqs: 128
kv_cache_ratio: 0.75
tensor_parallel_size: 1
quantization: wint8
graph_optimization_config:
graph_opt_level: 1
enable_static_graph_inference: True

View File

@@ -2,5 +2,4 @@ max_model_len: 32768
max_num_seqs: 128
kv_cache_ratio: 0.75
tensor_parallel_size: 1
graph_optimization_config:
graph_opt_level: 1
enable_static_graph_inference: True

View File

@@ -3,5 +3,4 @@ max_num_seqs: 128
kv_cache_ratio: 0.75
tensor_parallel_size: 1
quantization: wint4
graph_optimization_config:
graph_opt_level: 1
enable_static_graph_inference: True

View File

@@ -3,5 +3,4 @@ max_num_seqs: 96
gpu_memory_utilization: 0.9
kv_cache_ratio: 0.71
tensor_parallel_size: 4
graph_optimization_config:
graph_opt_level: 1
enable_static_graph_inference: True

View File

@@ -2,5 +2,4 @@ max_model_len: 32768
max_num_seqs: 128
kv_cache_ratio: 0.75
tensor_parallel_size: 1
graph_optimization_config:
graph_opt_level: 1
enable_static_graph_inference: True

View File

@@ -2,5 +2,4 @@ max_model_len: 32768
max_num_seqs: 128
kv_cache_ratio: 0.75
tensor_parallel_size: 1
graph_optimization_config:
graph_opt_level: 1
enable_static_graph_inference: True

View File

@@ -3,5 +3,4 @@ max_num_seqs: 128
kv_cache_ratio: 0.75
tensor_parallel_size: 1
quantization: wfp8afp8
graph_optimization_config:
graph_opt_level: 1
enable_static_graph_inference: True

View File

@@ -2,5 +2,4 @@ max_model_len: 32768
max_num_seqs: 128
kv_cache_ratio: 0.75
tensor_parallel_size: 1
graph_optimization_config:
  graph_opt_level: 1
enable_static_graph_inference: True

View File

@@ -2,5 +2,4 @@ max_model_len: 32768
max_num_seqs: 128
kv_cache_ratio: 0.75
tensor_parallel_size: 1
graph_optimization_config:
  graph_opt_level: 1
enable_static_graph_inference: True

View File

@@ -3,5 +3,4 @@ max_num_seqs: 128
kv_cache_ratio: 0.75
tensor_parallel_size: 1
quantization: wint8
graph_optimization_config:
  graph_opt_level: 1
enable_static_graph_inference: True

View File

@@ -3,5 +3,4 @@ max_num_seqs: 128
kv_cache_ratio: 0.75
tensor_parallel_size: 1
quantization: wint8
graph_optimization_config:
  graph_opt_level: 1
enable_static_graph_inference: True

View File

@@ -2,5 +2,4 @@ max_model_len: 32768
max_num_seqs: 128
kv_cache_ratio: 0.75
tensor_parallel_size: 1
graph_optimization_config:
  graph_opt_level: 1
enable_static_graph_inference: True

View File

@@ -3,5 +3,4 @@ max_num_seqs: 128
kv_cache_ratio: 0.75
tensor_parallel_size: 1
quantization: wint4
graph_optimization_config:
  graph_opt_level: 1
enable_static_graph_inference: True

View File

@@ -3,4 +3,4 @@ max_num_seqs: 75
gpu_memory_utilization: 0.85
kv_cache_ratio: 0.75
quantization: wint4
tensor_parallel_size: 4
tensor_parallel_size: 4

View File

@@ -3,4 +3,4 @@ max_num_seqs: 25
gpu_memory_utilization: 0.9
kv_cache_ratio: 0.75
quantization: wint8
tensor_parallel_size: 4
tensor_parallel_size: 4

View File

@@ -1,3 +1,3 @@
metadata:
  min_tokens: 32
  max_tokens: 33
  max_tokens: 33

View File

@@ -5,4 +5,4 @@ metadata:
  max_tokens: 12288
  repetition_penalty: 1.05
  frequency_penalty: 0
  presence_penalty: 0
  presence_penalty: 0

View File

@@ -5,4 +5,4 @@ metadata:
  max_tokens: 12288
  repetition_penalty: 1.0
  frequency_penalty: 0
  presence_penalty: 1.5
  presence_penalty: 1.5

View File

@@ -1,11 +0,0 @@
top_p: 1.0
temperature: 1.0
metadata:
  min_tokens: 1
  max_tokens: 30721
  repetition_penalty: 1.0
  frequency_penalty: 0
  presence_penalty: 0
  skip_special_tokens: false
chat_template_kwargs:
  enable_thinking: true

View File

@@ -3,4 +3,4 @@ max_num_seqs: 64
gpu_memory_utilization: 0.9
tensor_parallel_size: 8
quantization: wint8
reasoning_parser: ernie-x1
reasoning_parser: ernie-x1

View File

@@ -18,9 +18,6 @@ BUILD_WHEEL=${1:-1}
PYTHON_VERSION=${2:-"python"}
export python=$PYTHON_VERSION
FD_CPU_USE_BF16=${3:-"false"}
# FD_BUILDING_ARCS: Specify target CUDA architectures for custom ops, e.g., "[80, 90, 100]".
# For SM90 (Hopper), use 90. For SM100 (Blackwell), use 100.
# These will be translated to 90a / 100a in setup_ops.py for specific features.
FD_BUILDING_ARCS=${4:-""}
@@ -77,10 +74,8 @@ function copy_ops(){
is_rocm=`$python -c "import paddle; print(paddle.is_compiled_with_rocm())"`
if [ "$is_rocm" = "True" ]; then
DEVICE_TYPE="rocm"
mkdir -p ../fastdeploy/model_executor/ops/base
cp -r ./${OPS_TMP_DIR_BASE}/${WHEEL_BASE_NAME}/* ../fastdeploy/model_executor/ops/base
cp -r ./${OPS_TMP_DIR}/${WHEEL_NAME}/* ../fastdeploy/model_executor/ops/gpu
echo -e "BASE and ROCM ops have been copy to fastdeploy"
echo -e "ROCM ops have been copy to fastdeploy"
return
fi
mkdir -p ../fastdeploy/model_executor/ops/base
@@ -109,23 +104,6 @@ function copy_ops(){
return
fi
if_corex=`$python -c "import paddle; print(paddle.is_compiled_with_custom_device(\"iluvatar_gpu\"))"`
if [ "$if_corex" = "True" ]; then
DEVICE_TYPE="iluvatar-gpu"
cp -r ./${OPS_TMP_DIR_BASE}/${WHEEL_BASE_NAME}/* ../fastdeploy/model_executor/ops/base
cp -r ./${OPS_TMP_DIR}/${WHEEL_NAME}/* ../fastdeploy/model_executor/ops/iluvatar
echo -e "BASE and Iluvatar ops have been copy to fastdeploy"
return
fi
is_gcu=`$python -c "import paddle; print(paddle.is_compiled_with_custom_device('gcu'))"`
if [ "$is_gcu" = "True" ]; then
DEVICE_TYPE="gcu"
cp -r ${OPS_TMP_DIR}/${WHEEL_NAME}/* ../fastdeploy/model_executor/ops/gcu
echo -e "gcu ops have been copy to fastdeploy"
return
fi
DEVICE_TYPE="cpu"
cp -r ./${OPS_TMP_DIR_BASE}/${WHEEL_BASE_NAME}/* ../fastdeploy/model_executor/ops/base
cd ../../../../
@@ -185,6 +163,17 @@ function build_and_install() {
exit 1
fi
echo -e "${BLUE}[build]${NONE} ${GREEN}build fastdeploy wheel success${NONE}\n"
echo -e "${BLUE}[install]${NONE} installing fastdeploy..."
cd $DIST_DIR
find . -name "fastdeploy*.whl" | xargs ${python} -m pip install --force-reinstall --no-cache-dir
if [ $? -ne 0 ]; then
cd ..
echo -e "${RED}[FAIL]${NONE} install fastdeploy wheel failed"
exit 1
fi
echo -e "${BLUE}[install]${NONE} ${GREEN}fastdeploy install success${NONE}\n"
cd ..
}
function version_info() {
@@ -192,10 +181,7 @@ function version_info() {
fastdeploy_git_commit_id=$(git rev-parse HEAD)
paddle_version=$(${python} -c "import paddle; print(paddle.__version__)")
paddle_git_commit_id=$(${python} -c "import paddle; print(paddle.__git_commit__)")
cuda_version="nvcc-not-installed"
if command -v nvcc &> /dev/null; then
cuda_version=$(nvcc -V | grep -Po "(?<=release )[\d.]+(?=, V)")
fi
cuda_version=$(nvcc -V | grep -Po "(?<=release )[\d.]+(?=, V)")
cxx_version=$(g++ --version | head -n 1 | grep -Po "(?<=\) )[\d.]+")
echo "fastdeploy GIT COMMIT ID: $fastdeploy_git_commit_id" > $output_file

View File

@@ -46,8 +46,8 @@ std::vector<paddle::Tensor> AppendAttentionKernel(
const paddle::Tensor& seq_lens_encoder,
const paddle::Tensor& seq_lens_decoder,
const paddle::Tensor& seq_lens_this_time,
const paddle::Tensor& batch_id_per_token,
const paddle::Tensor& cu_seqlens_q,
const paddle::Tensor& padding_offsets,
const paddle::Tensor& cum_offsets,
const paddle::Tensor& block_tables,
const paddle::Tensor& encoder_batch_ids,
const paddle::Tensor& encoder_tile_ids_per_batch,
@@ -165,8 +165,8 @@ std::vector<paddle::Tensor> AppendAttentionKernel(
seq_lens_this_time,
seq_lens_decoder,
seq_lens_encoder,
batch_id_per_token,
cu_seqlens_q,
padding_offsets,
cum_offsets,
block_tables,
lambda_batch_ids,
lambda_tile_ids_per_batch,
@@ -202,8 +202,8 @@ std::vector<paddle::Tensor> AppendAttentionKernel(
seq_lens_this_time,
seq_lens_encoder,
seq_lens_decoder,
batch_id_per_token,
cu_seqlens_q,
padding_offsets,
cum_offsets,
block_tables,
kv_batch_ids,
kv_tile_ids_per_batch,
@@ -274,8 +274,8 @@ std::vector<paddle::Tensor> AppendAttentionKernel(
qkv, // [token_num, num_heads, head_dim]
seq_lens_decoder,
seq_lens_encoder,
batch_id_per_token,
cu_seqlens_q,
padding_offsets,
cum_offsets,
block_tables,
rotary_embs,
qkv_out_scales,
@@ -297,8 +297,8 @@ std::vector<paddle::Tensor> AppendAttentionKernel(
qkv_out, // [token_num, num_heads, head_dim]
seq_lens_decoder,
seq_lens_encoder,
batch_id_per_token,
cu_seqlens_q,
padding_offsets,
cum_offsets,
block_tables,
rotary_embs,
qkv_out_scales,
@@ -322,8 +322,8 @@ std::vector<paddle::Tensor> AppendAttentionKernel(
qkv, // [token_num, num_heads, head_dim]
seq_lens_decoder,
seq_lens_encoder,
batch_id_per_token,
cu_seqlens_q,
padding_offsets,
cum_offsets,
block_tables,
rotary_embs,
qkv_out_scales,
@@ -346,8 +346,8 @@ std::vector<paddle::Tensor> AppendAttentionKernel(
qkv_out, // [token_num, num_heads, head_dim]
seq_lens_decoder,
seq_lens_encoder,
batch_id_per_token,
cu_seqlens_q,
padding_offsets,
cum_offsets,
block_tables,
rotary_embs,
qkv_out_scales,
@@ -403,8 +403,8 @@ std::vector<paddle::Tensor> AppendAttention(
const paddle::Tensor& seq_lens_encoder,
const paddle::Tensor& seq_lens_decoder,
const paddle::Tensor& seq_lens_this_time,
const paddle::Tensor& batch_id_per_token,
const paddle::Tensor& cu_seqlens_q,
const paddle::Tensor& padding_offsets,
const paddle::Tensor& cum_offsets,
const paddle::Tensor& block_tables,
const paddle::Tensor& encoder_batch_ids,
const paddle::Tensor& encoder_tile_ids_per_batch,
@@ -462,7 +462,7 @@ std::vector<paddle::Tensor> AppendAttention(
meta_data.max_blocks_per_seq = block_tables.dims()[1];
meta_data.block_size = key_cache.dims()[2];
meta_data.batch_size = seq_lens_this_time.dims()[0];
meta_data.batch_size = cum_offsets.dims()[0];
auto dispatch_by_template = [&](auto temp_args) -> std::vector<paddle::Tensor> {
return AppendAttentionKernel<type2value<decltype(temp_args)>::value>(
@@ -473,8 +473,8 @@ std::vector<paddle::Tensor> AppendAttention(
seq_lens_encoder,
seq_lens_decoder,
seq_lens_this_time,
batch_id_per_token,
cu_seqlens_q,
padding_offsets,
cum_offsets,
block_tables,
encoder_batch_ids,
encoder_tile_ids_per_batch,
@@ -550,8 +550,8 @@ std::vector<std::vector<int64_t>> AppendAttentionInferShape(
const std::vector<int64_t>& seq_lens_encoder_shape,
const std::vector<int64_t>& seq_lens_decoder_shape,
const std::vector<int64_t>& seq_lens_this_time_shape,
const std::vector<int64_t>& batch_id_per_token_shape,
const std::vector<int64_t>& cu_seqlens_q_shape,
const std::vector<int64_t>& padding_offsets_shape,
const std::vector<int64_t>& cum_offsets_shape,
const std::vector<int64_t>& block_tables_shape,
const std::vector<int64_t>& encoder_batch_ids_shape,
const std::vector<int64_t>& encoder_tile_ids_per_batch_shape,
@@ -610,8 +610,8 @@ std::vector<paddle::DataType> AppendAttentionInferDtype(
const paddle::DataType& seq_lens_encoder_dtype,
const paddle::DataType& seq_lens_decoder_dtype,
const paddle::DataType& seq_lens_this_time_dtype,
const paddle::DataType& batch_id_per_token_dtype,
const paddle::DataType& cu_seqlens_q_dtype,
const paddle::DataType& padding_offsets_dtype,
const paddle::DataType& cum_offsets_dtype,
const paddle::DataType& block_tables_dtype,
const paddle::DataType& encoder_batch_ids_dtype,
const paddle::DataType& encoder_tile_ids_per_batch_dtype,
@@ -688,8 +688,8 @@ PD_BUILD_STATIC_OP(append_attention)
"seq_lens_encoder",
"seq_lens_decoder",
"seq_lens_this_time",
"batch_id_per_token",
"cu_seqlens_q",
"padding_offsets",
"cum_offsets",
"block_tables",
"encoder_batch_ids",
"encoder_tile_ids_per_batch",

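A note on the recurring parameter swap in these attention hunks: one side of the diff passes batch_id_per_token / cu_seqlens_q, the other padding_offsets / cum_offsets. Both pairs describe the same packed (unpadded) token layout, just encoded differently, which is also why the batch_size assignment can read dims()[0] off either seq_lens_this_time or cum_offsets (both have one entry per sequence). The sketch below is a plain host-side C++ paraphrase, not FastDeploy code; the tensor meanings are inferred from how the kernels in the following files consume them.

#include <vector>

// Plain host-side paraphrase (not FastDeploy code): the two index encodings
// that the two sides of these hunks pass around, built from the same
// per-sequence query lengths. Meanings are inferred from kernel usage:
//   cu_seqlens_q[b]        exclusive prefix sum of seq_lens (packed start of b)
//   batch_id_per_token[t]  sequence that packed token t belongs to
//   cum_offsets[b]         padding accumulated before sequence b
//   padding_offsets[t]     padded_index - packed_index for packed token t
struct PackedLayout {
  std::vector<int> cu_seqlens_q, batch_id_per_token;  // one encoding
  std::vector<int> cum_offsets, padding_offsets;      // the other encoding
};

PackedLayout build_layout(const std::vector<int>& seq_lens, int max_seq_len) {
  PackedLayout out;
  const int bsz = static_cast<int>(seq_lens.size());
  out.cu_seqlens_q.assign(bsz + 1, 0);
  out.cum_offsets.assign(bsz, 0);
  for (int b = 0; b < bsz; ++b) {
    out.cu_seqlens_q[b + 1] = out.cu_seqlens_q[b] + seq_lens[b];
    if (b + 1 < bsz)
      out.cum_offsets[b + 1] = out.cum_offsets[b] + (max_seq_len - seq_lens[b]);
  }
  for (int b = 0; b < bsz; ++b)
    for (int j = 0; j < seq_lens[b]; ++j) {
      out.batch_id_per_token.push_back(b);                // token -> its sequence
      out.padding_offsets.push_back(out.cum_offsets[b]);  // padding in front of it
    }
  return out;
}

With these definitions cu_seqlens_q[b] == b * max_seq_len - cum_offsets[b], which is the identity the kernel hunks below rely on when they swap the start-index expression.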
View File

@@ -41,7 +41,7 @@ __global__ void multi_query_append_attention_kernel(
const int *__restrict__ seq_lens_kv,
const int *__restrict__ batch_ids,
const int *__restrict__ tile_ids_per_batch,
const int *__restrict__ cu_seqlens_q,
const int *__restrict__ cum_offsets,
const int *__restrict__ block_table, // [bsz, block_num_per_seq]
const int max_seq_len,
const int max_dec_len,
@@ -114,7 +114,8 @@ __global__ void multi_query_append_attention_kernel(
const uint32_t kv_n_stride = kv_num_heads * BLOCK_SIZE * HEAD_DIM;
const uint32_t kv_h_stride = BLOCK_SIZE * HEAD_DIM;
const uint32_t kv_b_stride = HEAD_DIM;
const uint32_t q_start_seq_id = cu_seqlens_q[batch_id];
const uint32_t q_start_seq_id =
batch_id * max_seq_len - __ldg(&cum_offsets[batch_id]);
const uint32_t q_base_seq_id_this_block =
(tile_id * NUM_WARPS + wid) * num_frags_x * 16;
const uint32_t q_offset = q_start_seq_id * q_ori_n_stride +
@@ -404,7 +405,7 @@ __global__ void multi_query_append_attention_warp1_4_kernel(
const int *__restrict__ seq_lens_kv,
const int *__restrict__ batch_ids,
const int *__restrict__ tile_ids_per_batch,
const int *__restrict__ cu_seqlens_q,
const int *__restrict__ cum_offsets,
const int *__restrict__ block_table, // [bsz, block_num_per_seq]
const int max_seq_len,
const int max_dec_len,
@@ -476,7 +477,8 @@ __global__ void multi_query_append_attention_warp1_4_kernel(
const uint32_t kv_n_stride = kv_num_heads * BLOCK_SIZE * HEAD_DIM;
const uint32_t kv_h_stride = BLOCK_SIZE * HEAD_DIM;
const uint32_t kv_b_stride = HEAD_DIM;
const uint32_t q_start_seq_id = cu_seqlens_q[batch_id];
const uint32_t q_start_seq_id =
batch_id * max_seq_len - __ldg(&cum_offsets[batch_id]);
const uint32_t q_base_seq_id_this_block = tile_id * num_frags_x * 16;
const uint32_t q_offset = q_start_seq_id * q_ori_n_stride +
q_head_idx * HEAD_DIM +
@@ -773,8 +775,8 @@ void MultiQueryAppendAttention(
const paddle::Tensor &seq_lens_q,
const paddle::Tensor &seq_lens_kv,
const paddle::Tensor &seq_lens_encoder,
const paddle::Tensor &batch_id_per_token,
const paddle::Tensor &cu_seqlens_q,
const paddle::Tensor &padding_offsets,
const paddle::Tensor &cum_offsets,
const paddle::Tensor &block_table,
const paddle::Tensor &batch_ids,
const paddle::Tensor &tile_ids_per_batch,
@@ -880,7 +882,7 @@ void MultiQueryAppendAttention(
seq_lens_kv.data<int>(),
batch_ids.data<int>(),
tile_ids_per_batch.data<int>(),
cu_seqlens_q.data<int>(),
cum_offsets.data<int>(),
block_table.data<int>(),
max_seq_len,
max_dec_len,
@@ -937,7 +939,7 @@ void MultiQueryAppendAttention(
seq_lens_kv.data<int>(),
batch_ids.data<int>(),
tile_ids_per_batch.data<int>(),
cu_seqlens_q.data<int>(),
cum_offsets.data<int>(),
block_table.data<int>(),
max_seq_len,
max_dec_len,
@@ -972,7 +974,7 @@ void MultiQueryAppendAttention(
seq_lens_q.data<int>(),
seq_lens_kv.data<int>(),
seq_lens_encoder.data<int>(),
cu_seqlens_q.data<int>(),
cum_offsets.data<int>(),
shift_bias ? reinterpret_cast<NV_TYPE *>(
const_cast<T *>(shift_bias.get().data<T>()))
: nullptr,
@@ -1007,8 +1009,7 @@ void MultiQueryAppendAttention(
seq_lens_q.data<int>(),
seq_lens_kv.data<int>(),
seq_lens_encoder.data<int>(),
batch_id_per_token.data<int>(),
cu_seqlens_q.data<int>(),
padding_offsets.data<int>(),
shift_bias ? reinterpret_cast<NV_TYPE *>(
const_cast<T *>(shift_bias.get().data<T>()))
: nullptr,
@@ -1102,7 +1103,7 @@ void MultiQueryAppendAttention(
seq_lens_kv.data<int>(),
batch_ids.data<int>(),
tile_ids_per_batch.data<int>(),
cu_seqlens_q.data<int>(),
cum_offsets.data<int>(),
block_table.data<int>(),
max_seq_len,
max_dec_len,
@@ -1170,7 +1171,7 @@ void MultiQueryAppendAttention(
seq_lens_kv.data<int>(),
batch_ids.data<int>(),
tile_ids_per_batch.data<int>(),
cu_seqlens_q.data<int>(),
cum_offsets.data<int>(),
block_table.data<int>(),
max_seq_len,
max_dec_len,
@@ -1206,7 +1207,7 @@ void MultiQueryAppendAttention(
seq_lens_q.data<int>(),
seq_lens_kv.data<int>(),
seq_lens_encoder.data<int>(),
cu_seqlens_q.data<int>(),
cum_offsets.data<int>(),
shift_bias ? reinterpret_cast<NV_TYPE *>(
const_cast<T *>(shift_bias.get().data<T>()))
: nullptr,
@@ -1241,8 +1242,7 @@ void MultiQueryAppendAttention(
seq_lens_q.data<int>(),
seq_lens_kv.data<int>(),
seq_lens_encoder.data<int>(),
batch_id_per_token.data<int>(),
cu_seqlens_q.data<int>(),
padding_offsets.data<int>(),
shift_bias ? reinterpret_cast<NV_TYPE *>(
const_cast<T *>(shift_bias.get().data<T>()))
: nullptr,
@@ -1289,8 +1289,8 @@ void CascadeAppendAttentionC16Kernel(
const paddle::Tensor& seq_lens_q,
const paddle::Tensor& seq_lens_kv,
const paddle::Tensor& seq_lens_encoder,
const paddle::Tensor& batch_id_per_token,
const paddle::Tensor& cu_seqlens_q,
const paddle::Tensor& padding_offsets,
const paddle::Tensor& cum_offsets,
const paddle::Tensor& block_table,
const paddle::Tensor& batch_ids,
const paddle::Tensor& tile_ids_per_batch,
@@ -1352,8 +1352,8 @@ void CascadeAppendAttentionC16Kernel(
seq_lens_q,
seq_lens_kv,
seq_lens_encoder,
batch_id_per_token,
cu_seqlens_q,
padding_offsets,
cum_offsets,
block_table,
batch_ids,
tile_ids_per_batch,
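The in-kernel counterpart of that swap is the expression for a sequence's first packed query token (q_start_seq_id / start_token_idx / start_token_ids in the hunks above and in the C4/C8 variants below): one side indexes a precomputed prefix sum, the other derives the same value from max_seq_len and the per-sequence padding. A minimal C++ paraphrase, assuming cum_offsets holds the padding accumulated before each sequence as sketched earlier; the helper names are mine, not the kernel's.

// Both helpers return the packed index of sequence batch_id's first query token.
inline int start_from_prefix_sum(const int* cu_seqlens_q, int batch_id) {
  return cu_seqlens_q[batch_id];  // exclusive prefix sum of sequence lengths
}

inline int start_from_cum_offsets(const int* cum_offsets, int batch_id,
                                  int max_seq_len) {
  return batch_id * max_seq_len - cum_offsets[batch_id];  // padded start minus padding
}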

View File

@@ -46,7 +46,7 @@ __global__ void multi_query_append_attention_c4_kernel(
const int *__restrict__ seq_lens_kv,
const int *__restrict__ batch_ids,
const int *__restrict__ tile_ids_per_batch,
const int *__restrict__ cu_seqlens_q,
const int *__restrict__ cum_offsets,
const int *__restrict__ block_table, // [bsz, block_num_per_seq]
const int max_seq_len,
const int max_dec_len,
@@ -144,7 +144,8 @@ __global__ void multi_query_append_attention_c4_kernel(
const uint32_t kv_h_stride = BLOCK_SIZE * HEAD_DIM / 2;
const uint32_t kv_b_stride = HEAD_DIM / 2;
const uint32_t kv_d_stride = BLOCK_SIZE / 2;
const uint32_t q_start_seq_id = cu_seqlens_q[batch_id];
const uint32_t q_start_seq_id =
batch_id * max_seq_len - __ldg(&cum_offsets[batch_id]);
const uint32_t q_base_seq_id_this_block =
(tile_id * NUM_WARPS + wid) * num_frags_x * 16;
const uint32_t q_offset = q_start_seq_id * q_ori_n_stride +
@@ -503,7 +504,7 @@ __global__ void multi_query_append_attention_c4_warp1_4_kernel(
const int *__restrict__ seq_lens_kv,
const int *__restrict__ batch_ids,
const int *__restrict__ tile_ids_per_batch,
const int *__restrict__ cu_seqlens_q,
const int *__restrict__ cum_offsets,
const int *__restrict__ block_table, // [bsz, block_num_per_seq]
const int max_seq_len,
const int max_dec_len,
@@ -600,7 +601,8 @@ __global__ void multi_query_append_attention_c4_warp1_4_kernel(
const uint32_t kv_h_stride = BLOCK_SIZE * HEAD_DIM / 2;
const uint32_t kv_b_stride = HEAD_DIM / 2;
const uint32_t kv_d_stride = BLOCK_SIZE / 2;
const uint32_t q_start_seq_id = cu_seqlens_q[batch_id];
const uint32_t q_start_seq_id =
batch_id * max_seq_len - __ldg(&cum_offsets[batch_id]);
const uint32_t q_base_seq_id_this_block = tile_id * num_frags_x * 16;
const uint32_t q_offset = q_start_seq_id * q_ori_n_stride +
q_head_idx * HEAD_DIM +
@@ -960,8 +962,8 @@ void MultiQueryAppendC4Attention(
const paddle::Tensor &seq_lens_q,
const paddle::Tensor &seq_lens_kv,
const paddle::Tensor &seq_lens_encoder,
const paddle::Tensor &batch_id_per_token,
const paddle::Tensor &cu_seqlens_q,
const paddle::Tensor &padding_offsets,
const paddle::Tensor &cum_offsets,
const paddle::Tensor &block_table,
const paddle::Tensor &batch_ids,
const paddle::Tensor &tile_ids_per_batch,
@@ -1086,7 +1088,7 @@ void MultiQueryAppendC4Attention(
seq_lens_kv.data<int>(),
batch_ids.data<int>(),
tile_ids_per_batch.data<int>(),
cu_seqlens_q.data<int>(),
cum_offsets.data<int>(),
block_table.data<int>(),
max_seq_len,
max_dec_len,
@@ -1149,7 +1151,7 @@ void MultiQueryAppendC4Attention(
seq_lens_kv.data<int>(),
batch_ids.data<int>(),
tile_ids_per_batch.data<int>(),
cu_seqlens_q.data<int>(),
cum_offsets.data<int>(),
block_table.data<int>(),
max_seq_len,
max_dec_len,
@@ -1184,7 +1186,7 @@ void MultiQueryAppendC4Attention(
seq_lens_q.data<int>(),
seq_lens_kv.data<int>(),
seq_lens_encoder.data<int>(),
cu_seqlens_q.data<int>(),
cum_offsets.data<int>(),
shift_bias ? reinterpret_cast<NV_TYPE *>(
const_cast<T *>(shift_bias.get().data<T>()))
: nullptr,
@@ -1219,8 +1221,7 @@ void MultiQueryAppendC4Attention(
seq_lens_q.data<int>(),
seq_lens_kv.data<int>(),
seq_lens_encoder.data<int>(),
batch_id_per_token.data<int>(),
cu_seqlens_q.data<int>(),
padding_offsets.data<int>(),
shift_bias ? reinterpret_cast<NV_TYPE *>(
const_cast<T *>(shift_bias.get().data<T>()))
: nullptr,
@@ -1332,7 +1333,7 @@ void MultiQueryAppendC4Attention(
seq_lens_kv.data<int>(),
batch_ids.data<int>(),
tile_ids_per_batch.data<int>(),
cu_seqlens_q.data<int>(),
cum_offsets.data<int>(),
block_table.data<int>(),
max_seq_len,
max_dec_len,
@@ -1408,7 +1409,7 @@ void MultiQueryAppendC4Attention(
seq_lens_kv.data<int>(),
batch_ids.data<int>(),
tile_ids_per_batch.data<int>(),
cu_seqlens_q.data<int>(),
cum_offsets.data<int>(),
block_table.data<int>(),
max_seq_len,
max_dec_len,
@@ -1443,7 +1444,7 @@ void MultiQueryAppendC4Attention(
seq_lens_q.data<int>(),
seq_lens_kv.data<int>(),
seq_lens_encoder.data<int>(),
cu_seqlens_q.data<int>(),
cum_offsets.data<int>(),
shift_bias ? reinterpret_cast<NV_TYPE *>(
const_cast<T *>(shift_bias.get().data<T>()))
: nullptr,
@@ -1478,8 +1479,7 @@ void MultiQueryAppendC4Attention(
seq_lens_q.data<int>(),
seq_lens_kv.data<int>(),
seq_lens_encoder.data<int>(),
batch_id_per_token.data<int>(),
cu_seqlens_q.data<int>(),
padding_offsets.data<int>(),
shift_bias ? reinterpret_cast<NV_TYPE *>(
const_cast<T *>(shift_bias.get().data<T>()))
: nullptr,
@@ -1526,8 +1526,8 @@ void CascadeAppendAttentionC4Kernel(
const paddle::Tensor& seq_lens_q,
const paddle::Tensor& seq_lens_kv,
const paddle::Tensor& seq_lens_encoder,
const paddle::Tensor& batch_id_per_token,
const paddle::Tensor& cu_seqlens_q,
const paddle::Tensor& padding_offsets,
const paddle::Tensor& cum_offsets,
const paddle::Tensor& block_table,
const paddle::Tensor& batch_ids,
const paddle::Tensor& tile_ids_per_batch,
@@ -1593,8 +1593,8 @@ void CascadeAppendAttentionC4Kernel(
seq_lens_q,
seq_lens_kv,
seq_lens_encoder,
batch_id_per_token,
cu_seqlens_q,
padding_offsets,
cum_offsets,
block_table,
batch_ids,
tile_ids_per_batch,

View File

@@ -46,7 +46,7 @@ __global__ void multi_query_append_attention_c8_kernel(
const int *__restrict__ seq_lens_kv,
const int *__restrict__ batch_ids,
const int *__restrict__ tile_ids_per_batch,
const int *__restrict__ cu_seqlens_q,
const int *__restrict__ cum_offsets,
const int *__restrict__ block_table, // [bsz, block_num_per_seq]
const int max_seq_len,
const int max_dec_len,
@@ -151,7 +151,8 @@ __global__ void multi_query_append_attention_c8_kernel(
const uint32_t kv_h_stride = BLOCK_SIZE * HEAD_DIM;
const uint32_t kv_b_stride = HEAD_DIM;
const uint32_t kv_d_stride = BLOCK_SIZE;
const uint32_t q_start_seq_id = cu_seqlens_q[batch_id];
const uint32_t q_start_seq_id =
batch_id * max_seq_len - __ldg(&cum_offsets[batch_id]);
const uint32_t q_base_seq_id_this_block =
(tile_id * NUM_WARPS + wid) * num_frags_x * 16;
const uint32_t q_offset = q_start_seq_id * q_ori_n_stride +
@@ -472,7 +473,7 @@ __global__ void multi_query_append_attention_c8_warp1_4_kernel(
const int *__restrict__ seq_lens_kv,
const int *__restrict__ batch_ids,
const int *__restrict__ tile_ids_per_batch,
const int *__restrict__ cu_seqlens_q,
const int *__restrict__ cum_offsets,
const int *__restrict__ block_table, // [bsz, block_num_per_seq]
const int max_seq_len,
const int max_dec_len,
@@ -574,7 +575,8 @@ __global__ void multi_query_append_attention_c8_warp1_4_kernel(
const uint32_t kv_h_stride = BLOCK_SIZE * HEAD_DIM;
const uint32_t kv_b_stride = HEAD_DIM;
const uint32_t kv_d_stride = BLOCK_SIZE;
const uint32_t q_start_seq_id = cu_seqlens_q[batch_id];
const uint32_t q_start_seq_id =
batch_id * max_seq_len - __ldg(&cum_offsets[batch_id]);
const uint32_t q_base_seq_id_this_block = tile_id * num_frags_x * 16;
const uint32_t q_offset = q_start_seq_id * q_ori_n_stride +
q_head_idx * HEAD_DIM +
@@ -897,8 +899,8 @@ void MultiQueryAppendC8Attention(
const paddle::Tensor &seq_lens_q,
const paddle::Tensor &seq_lens_kv,
const paddle::Tensor &seq_lens_encoder,
const paddle::Tensor &batch_id_per_token,
const paddle::Tensor &cu_seqlens_q,
const paddle::Tensor &padding_offsets,
const paddle::Tensor &cum_offsets,
const paddle::Tensor &block_table,
const paddle::Tensor &batch_ids,
const paddle::Tensor &tile_ids_per_batch,
@@ -1052,7 +1054,7 @@ void MultiQueryAppendC8Attention(
seq_lens_kv.data<int>(),
batch_ids.data<int>(),
tile_ids_per_batch.data<int>(),
cu_seqlens_q.data<int>(),
cum_offsets.data<int>(),
block_table.data<int>(),
max_seq_len,
max_dec_len,
@@ -1109,7 +1111,7 @@ void MultiQueryAppendC8Attention(
seq_lens_kv.data<int>(),
batch_ids.data<int>(),
tile_ids_per_batch.data<int>(),
cu_seqlens_q.data<int>(),
cum_offsets.data<int>(),
block_table.data<int>(),
max_seq_len,
max_dec_len,
@@ -1144,7 +1146,7 @@ void MultiQueryAppendC8Attention(
seq_lens_q.data<int>(),
seq_lens_kv.data<int>(),
seq_lens_encoder.data<int>(),
cu_seqlens_q.data<int>(),
cum_offsets.data<int>(),
shift_bias ? reinterpret_cast<NV_TYPE *>(
const_cast<T *>(shift_bias.get().data<T>()))
: nullptr,
@@ -1179,8 +1181,7 @@ void MultiQueryAppendC8Attention(
seq_lens_q.data<int>(),
seq_lens_kv.data<int>(),
seq_lens_encoder.data<int>(),
batch_id_per_token.data<int>(),
cu_seqlens_q.data<int>(),
padding_offsets.data<int>(),
shift_bias ? reinterpret_cast<NV_TYPE *>(
const_cast<T *>(shift_bias.get().data<T>()))
: nullptr,
@@ -1316,7 +1317,7 @@ void MultiQueryAppendC8Attention(
seq_lens_kv.data<int>(),
batch_ids.data<int>(),
tile_ids_per_batch.data<int>(),
cu_seqlens_q.data<int>(),
cum_offsets.data<int>(),
block_table.data<int>(),
max_seq_len,
max_dec_len,
@@ -1386,7 +1387,7 @@ void MultiQueryAppendC8Attention(
seq_lens_kv.data<int>(),
batch_ids.data<int>(),
tile_ids_per_batch.data<int>(),
cu_seqlens_q.data<int>(),
cum_offsets.data<int>(),
block_table.data<int>(),
max_seq_len,
max_dec_len,
@@ -1416,7 +1417,7 @@ void MultiQueryAppendC8Attention(
seq_lens_q.data<int>(),
seq_lens_kv.data<int>(),
seq_lens_encoder.data<int>(),
cu_seqlens_q.data<int>(),
cum_offsets.data<int>(),
shift_bias ? reinterpret_cast<NV_TYPE *>(
const_cast<T *>(shift_bias.get().data<T>()))
: nullptr,
@@ -1451,8 +1452,7 @@ void MultiQueryAppendC8Attention(
seq_lens_q.data<int>(),
seq_lens_kv.data<int>(),
seq_lens_encoder.data<int>(),
batch_id_per_token.data<int>(),
cu_seqlens_q.data<int>(),
padding_offsets.data<int>(),
shift_bias ? reinterpret_cast<NV_TYPE *>(
const_cast<T *>(shift_bias.get().data<T>()))
: nullptr,
@@ -1499,8 +1499,8 @@ void CascadeAppendAttentionC8Kernel(
const paddle::Tensor& seq_lens_q,
const paddle::Tensor& seq_lens_kv,
const paddle::Tensor& seq_lens_encoder,
const paddle::Tensor& batch_id_per_token,
const paddle::Tensor& cu_seqlens_q,
const paddle::Tensor& padding_offsets,
const paddle::Tensor& cum_offsets,
const paddle::Tensor& block_table,
const paddle::Tensor& batch_ids,
const paddle::Tensor& tile_ids_per_batch,
@@ -1564,8 +1564,8 @@ void CascadeAppendAttentionC8Kernel(
seq_lens_q,
seq_lens_kv,
seq_lens_encoder,
batch_id_per_token,
cu_seqlens_q,
padding_offsets,
cum_offsets,
block_table,
batch_ids,
tile_ids_per_batch,

View File

@@ -1852,7 +1852,7 @@ __global__ void merge_multi_chunks_kernel(
const float* __restrict__ multi_d, // [token_num, num_chunks, num_heads]
const int* __restrict__ seq_lens_q,
const int* __restrict__ seq_lens_kv,
const int* __restrict__ batch_id_per_token,
const int* __restrict__ padding_offsets,
const T* __restrict__ shift_bias, // [q_num_heads * HEAD_DIM]
const T* __restrict__ smooth_weight, // [q_num_heads * HEAD_DIM]
T* __restrict__ out,
@@ -1866,7 +1866,8 @@ __global__ void merge_multi_chunks_kernel(
const int head_dim) {
const int vid = threadIdx.x, hid = threadIdx.y;
const int qid = blockIdx.x;
const uint32_t bid = batch_id_per_token[qid];
const uint32_t ori_token_id = qid + padding_offsets[qid];
const uint32_t bid = ori_token_id / max_seq_len;
if (seq_lens_q[bid] <= 0 || seq_lens_kv[bid] <= 0) {
return;
}
@@ -2110,7 +2111,7 @@ __global__ void merge_multi_chunks_decoder_kernel(
const int *__restrict__ seq_lens_q,
const int *__restrict__ seq_lens_kv,
const int *__restrict__ seq_lens_encoder,
const int *__restrict__ cu_seqlens_q,
const int *__restrict__ cum_offsets,
const T *__restrict__ shift_bias, // [q_num_heads * HEAD_DIM]
const T *__restrict__ smooth_weight, // [q_num_heads * HEAD_DIM]
OutT *__restrict__ out,
@@ -2126,7 +2127,7 @@ __global__ void merge_multi_chunks_decoder_kernel(
const int bid = blockIdx.x, hid = blockIdx.y;
__shared__ T smem[bdy * HEAD_DIM];
__shared__ float md_smem[bdy * 2];
const int start_token_idx = cu_seqlens_q[bid];
const int start_token_idx = bid * max_seq_len - cum_offsets[bid];
const int seq_len_q = seq_lens_q[bid];
if (seq_len_q == 0) return;
int seq_len_kv = seq_lens_kv[bid];
@@ -2239,8 +2240,7 @@ __global__ void merge_multi_chunks_v2_kernel(
const int *__restrict__ seq_lens_q,
const int *__restrict__ seq_lens_kv,
const int *__restrict__ seq_lens_encoder,
const int *__restrict__ batch_id_per_token,
const int *__restrict__ cu_seqlens_q,
const int *__restrict__ padding_offsets,
const T *__restrict__ shift_bias, // [q_num_heads * HEAD_DIM]
const T *__restrict__ smooth_weight, // [q_num_heads * HEAD_DIM]
OutT *__restrict__ out,
@@ -2259,8 +2259,9 @@ __global__ void merge_multi_chunks_v2_kernel(
__shared__ T smem[bdy * HEAD_DIM];
__shared__ float md_smem[bdy * 2];
for (int qid = blockIdx.x; qid < token_num; qid += gridDim.x) {
const uint32_t bid = batch_id_per_token[qid];
const uint32_t local_seq_id = qid - cu_seqlens_q[bid];
const uint32_t ori_token_id = qid + padding_offsets[qid];
const uint32_t bid = ori_token_id / max_seq_len;
const uint32_t local_seq_id = ori_token_id % max_seq_len;
const int seq_len_q = seq_lens_q[bid];
if (seq_len_q == 0) continue;
int seq_len_kv = seq_lens_kv[bid];
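The merge kernels in this file (and the rotary-embedding kernels further down) need the reverse mapping: from a packed token index qid back to its sequence and its position inside that sequence. The two sides of the hunks compute it in the two ways paraphrased below; std::pair and the function names are illustrative, not part of the kernels.

#include <utility>

// Returns {sequence id, position within that sequence} for packed token qid.
inline std::pair<int, int> locate_via_batch_id(const int* batch_id_per_token,
                                               const int* cu_seqlens_q, int qid) {
  const int bid = batch_id_per_token[qid];
  return {bid, qid - cu_seqlens_q[bid]};
}

inline std::pair<int, int> locate_via_padding_offsets(const int* padding_offsets,
                                                      int max_seq_len, int qid) {
  const int ori_token_id = qid + padding_offsets[qid];  // index in the padded layout
  return {ori_token_id / max_seq_len, ori_token_id % max_seq_len};
}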

View File

@@ -40,8 +40,8 @@ void CascadeAppendAttentionC16Kernel(
const paddle::Tensor& seq_lens_q,
const paddle::Tensor& seq_lens_kv,
const paddle::Tensor& seq_lens_encoder,
const paddle::Tensor& batch_id_per_token,
const paddle::Tensor& cu_seqlens_q,
const paddle::Tensor& padding_offsets,
const paddle::Tensor& cum_offsets,
const paddle::Tensor& block_table,
const paddle::Tensor& batch_ids,
const paddle::Tensor& tile_ids_per_batch,
@@ -85,8 +85,8 @@ void CascadeAppendAttentionC8Kernel(
const paddle::Tensor& seq_lens_q,
const paddle::Tensor& seq_lens_kv,
const paddle::Tensor& seq_lens_encoder,
const paddle::Tensor& batch_id_per_token,
const paddle::Tensor& cu_seqlens_q,
const paddle::Tensor& padding_offsets,
const paddle::Tensor& cum_offsets,
const paddle::Tensor& block_table,
const paddle::Tensor& batch_ids,
const paddle::Tensor& tile_ids_per_batch,
@@ -130,8 +130,8 @@ void CascadeAppendAttentionC4Kernel(
const paddle::Tensor& seq_lens_q,
const paddle::Tensor& seq_lens_kv,
const paddle::Tensor& seq_lens_encoder,
const paddle::Tensor& batch_id_per_token,
const paddle::Tensor& cu_seqlens_q,
const paddle::Tensor& padding_offsets,
const paddle::Tensor& cum_offsets,
const paddle::Tensor& block_table,
const paddle::Tensor& batch_ids,
const paddle::Tensor& tile_ids_per_batch,
@@ -175,8 +175,8 @@ void CascadeAppendAttentionKernel(
const paddle::Tensor& seq_lens_q,
const paddle::Tensor& seq_lens_kv,
const paddle::Tensor& seq_lens_encoder,
const paddle::Tensor& batch_id_per_token,
const paddle::Tensor& cu_seqlens_q,
const paddle::Tensor& padding_offsets,
const paddle::Tensor& cum_offsets,
const paddle::Tensor& block_table,
const paddle::Tensor& batch_ids,
const paddle::Tensor& tile_ids_per_batch,
@@ -211,8 +211,8 @@ void CascadeAppendAttentionKernel(
seq_lens_q,
seq_lens_kv,
seq_lens_encoder,
batch_id_per_token,
cu_seqlens_q,
padding_offsets,
cum_offsets,
block_table,
batch_ids,
tile_ids_per_batch,
@@ -246,8 +246,8 @@ void CascadeAppendAttentionKernel(
seq_lens_q,
seq_lens_kv,
seq_lens_encoder,
batch_id_per_token,
cu_seqlens_q,
padding_offsets,
cum_offsets,
block_table,
batch_ids,
tile_ids_per_batch,
@@ -281,8 +281,8 @@ void CascadeAppendAttentionKernel(
seq_lens_q,
seq_lens_kv,
seq_lens_encoder,
batch_id_per_token,
cu_seqlens_q,
padding_offsets,
cum_offsets,
block_table,
batch_ids,
tile_ids_per_batch,
@@ -316,8 +316,8 @@ void CascadeAppendAttentionKernel(
seq_lens_q,
seq_lens_kv,
seq_lens_encoder,
batch_id_per_token,
cu_seqlens_q,
padding_offsets,
cum_offsets,
block_table,
batch_ids,
tile_ids_per_batch,

View File

@@ -35,7 +35,7 @@ __global__ void merge_varlen_multi_chunks_v2_kernel(const T * __restrict__ multi
const T * __restrict__ multi_d, // [bsz, num_chunks, num_heads]
const int * __restrict__ seq_lens_q,
const int * __restrict__ seq_lens_kv,
const int * __restrict__ cu_seqlens_q,
const int * __restrict__ cum_offsets,
const T * __restrict__ shift_bias, // [q_num_heads * HEAD_DIM]
const T * __restrict__ smooth_weight, // [q_num_heads * HEAD_DIM]
OutT * __restrict__ out, // [token_num, num_heads, head_dim]
@@ -59,7 +59,7 @@ __global__ void merge_varlen_multi_chunks_v2_kernel(const T * __restrict__ multi
__shared__ T smem[bdy * HEAD_DIM];
__shared__ T md_smem[bdy * 2];
const int start_token_ids = cu_seqlens_q[qid];
const int start_token_ids = qid * max_seq_len - __ldg(&cum_offsets[qid]);
using LoadT = AlignedVector<T, vec_size>;
LoadT load_vec;
LoadT res_vec;
@@ -134,7 +134,7 @@ __global__ void multi_query_decode_attention_kernel(T * __restrict__ q, // [toke
const T * __restrict__ smooth_weight, // [q_num_heads * HEAD_DIM]
const int * __restrict__ seq_lens_q,
const int * __restrict__ seq_lens_kv,
const int * __restrict__ cu_seqlens_q,
const int * __restrict__ cum_offsets,
const int * __restrict__ block_table, // [bsz, block_num_per_seq]
const int max_seq_len,
const int max_dec_len,
@@ -171,8 +171,8 @@ __global__ void multi_query_decode_attention_kernel(T * __restrict__ q, // [toke
}
kv_len += q_len;
const uint32_t num_chunk_this_seq = div_up(kv_len, chunk_size);
const uint32_t q_start_idx = cu_seqlens_q[bid];
const uint32_t q_write_idx = cu_seqlens_q[bid];
const uint32_t q_start_idx = bid * max_seq_len - __ldg(&cum_offsets[bid]);
const uint32_t q_write_idx = bid * max_seq_len - __ldg(&cum_offsets[bid]);
if (chunk_id >= num_chunk_this_seq) {
return;
}
@@ -317,8 +317,8 @@ void MultiQueryDecoderAttention(
const paddle::optional<paddle::Tensor>& smooth_weight,
const paddle::Tensor &seq_lens_q,
const paddle::Tensor &seq_lens_kv,
const paddle::Tensor &batch_id_per_token,
const paddle::Tensor &cu_seqlens_q,
const paddle::Tensor &padding_offsets,
const paddle::Tensor &cum_offsets,
const paddle::Tensor &block_table,
const int max_seq_len,
const int max_dec_len,
@@ -393,7 +393,7 @@ void MultiQueryDecoderAttention(
reinterpret_cast<NV_TYPE*>(const_cast<T*>(smooth_weight_ptr)),
seq_lens_q.data<int>(),
seq_lens_kv.data<int>(),
cu_seqlens_q.data<int>(),
cum_offsets.data<int>(),
block_table.data<int>(),
max_seq_len,
max_dec_len,
@@ -430,7 +430,7 @@ void MultiQueryDecoderAttention(
reinterpret_cast<NV_TYPE*>(const_cast<T*>(smooth_weight_ptr)),
seq_lens_q.data<int>(),
seq_lens_kv.data<int>(),
cu_seqlens_q.data<int>(),
cum_offsets.data<int>(),
block_table.data<int>(),
max_seq_len,
max_dec_len,
@@ -456,7 +456,7 @@ void MultiQueryDecoderAttention(
reinterpret_cast<NV_TYPE*>(tmp_d->ptr()),
seq_lens_q.data<int>(),
seq_lens_kv.data<int>(),
cu_seqlens_q.data<int>(),
cum_offsets.data<int>(),
reinterpret_cast<NV_TYPE*>(const_cast<T*>(shift_bias_ptr)),
reinterpret_cast<NV_TYPE*>(const_cast<T*>(smooth_weight_ptr)),
reinterpret_cast<NV_TYPE*>(const_cast<T*>(out->data<T>())),
@@ -483,8 +483,8 @@ void DecodeMLAAttentionKernel(
const paddle::optional<paddle::Tensor>& smooth_weight,
const paddle::Tensor &seq_lens_q, // q_seq_len is 1
const paddle::Tensor &seq_lens_kv,
const paddle::Tensor &batch_id_per_token,
const paddle::Tensor &cu_seqlens_q,
const paddle::Tensor &padding_offsets,
const paddle::Tensor &cum_offsets,
const paddle::Tensor &block_table,
int max_seq_len,
int max_dec_len,
@@ -513,7 +513,7 @@ void DecodeMLAAttentionKernel(
{DISPATCH_BLOCK_SIZE(block_size, BLOCK_SIZE,
{DISPATCH_DEAL_EACH_TIME(deal_each_time, DEAL_EACH_TIME,
{MultiQueryDecoderAttention<T, GROUP_SIZE, HEAD_DIM_QK, HEAD_DIM_V, BLOCK_SIZE, CAUSAL, 2, 16, DEAL_EACH_TIME>(
meta_data, stream, q, cache_k, cache_v, attn_mask, shift_bias, smooth_weight, seq_lens_q, seq_lens_kv, batch_id_per_token, cu_seqlens_q,
meta_data, stream, q, cache_k, cache_v, attn_mask, shift_bias, smooth_weight, seq_lens_q, seq_lens_kv, padding_offsets, cum_offsets,
block_table, max_seq_len, max_dec_len, rope_scale, rope_theta, softmax_scale, in_scale, out);})})})})})});
}
@@ -527,8 +527,8 @@ template void DecodeMLAAttentionKernel<paddle::bfloat16>(
const paddle::optional<paddle::Tensor>& smooth_weight,
const paddle::Tensor &seq_lens_q, // q_seq_len is 1
const paddle::Tensor &seq_lens_kv,
const paddle::Tensor &batch_id_per_token,
const paddle::Tensor &cu_seqlens_q,
const paddle::Tensor &padding_offsets,
const paddle::Tensor &cum_offsets,
const paddle::Tensor &block_table,
int max_seq_len,
int max_dec_len,
@@ -548,8 +548,8 @@ template void DecodeMLAAttentionKernel<paddle::float16>(
const paddle::optional<paddle::Tensor>& smooth_weight,
const paddle::Tensor &seq_lens_q, // q_seq_len is 1
const paddle::Tensor &seq_lens_kv,
const paddle::Tensor &batch_id_per_token,
const paddle::Tensor &cu_seqlens_q,
const paddle::Tensor &padding_offsets,
const paddle::Tensor &cum_offsets,
const paddle::Tensor &block_table,
int max_seq_len,
int max_dec_len,

View File

@@ -28,8 +28,8 @@ __global__ void append_decode_cache_T_rope_kernel(
// head_size // 2]
T* __restrict__ qkv_out,
const int* __restrict__ block_tables, // [bsz, max_blocks_per_seq]
const int* __restrict__ batch_id_per_token, // [num_tokens]
const int* __restrict__ cu_seqlens_q,
const int* __restrict__ padding_offsets, // [num_tokens]
const int* __restrict__ cum_offsets,
const int* __restrict__ seq_lens, // [bsz]
const int* __restrict__ seq_lens_encoder, // [bsz]
const float* __restrict__ cos_emb,
@@ -65,7 +65,7 @@ __global__ void append_decode_cache_T_rope_kernel(
const int bias = linear_index % hidden_size;
const int hi = bias / head_size; // q + k + v
const int h_bias = bias % head_size;
const int start_token_idx = cu_seqlens_q[ori_bi];
const int start_token_idx = ori_bi * max_seq_len - cum_offsets[ori_bi];
if (seq_lens_encoder[ori_bi] > 0) return;
const int write_seq_id = seq_lens[ori_bi];
if (write_seq_id == 0) continue;
@@ -134,8 +134,8 @@ __global__ void append_decode_cache_T_rope_kernel(
// head_size // 2]
T* __restrict__ qkv_out,
const int* __restrict__ block_tables, // [bsz, max_blocks_per_seq]
const int* __restrict__ batch_id_per_token, // [num_tokens]
const int* __restrict__ cu_seqlens_q,
const int* __restrict__ padding_offsets, // [num_tokens]
const int* __restrict__ cum_offsets,
const int* __restrict__ seq_lens, // [bsz]
const int* __restrict__ seq_lens_encoder, // [bsz]
const float* __restrict__ cos_emb,
@@ -177,7 +177,7 @@ __global__ void append_decode_cache_T_rope_kernel(
const int bias = linear_index % hidden_size;
const int hi = bias / head_size; // q + k + v
const int h_bias = bias % head_size;
const int start_token_idx = cu_seqlens_q[ori_bi];
const int start_token_idx = ori_bi * max_seq_len - cum_offsets[ori_bi];
if (seq_lens_encoder[ori_bi] > 0) return;
const int write_seq_id = seq_lens[ori_bi];
if (write_seq_id == 0) continue;
@@ -254,8 +254,8 @@ __global__ void append_decode_cache_T_neox_rope_kernel(
// head_size // 2]
T* __restrict__ qkv_out,
const int* __restrict__ block_tables, // [bsz, max_blocks_per_seq]
const int* __restrict__ batch_id_per_token, // [num_tokens]
const int* __restrict__ cu_seqlens_q,
const int* __restrict__ padding_offsets, // [num_tokens]
const int* __restrict__ cum_offsets,
const int* __restrict__ seq_lens, // [bsz]
const int* __restrict__ seq_lens_encoder, // [bsz]
const float* __restrict__ cos_emb,
@@ -293,7 +293,7 @@ __global__ void append_decode_cache_T_neox_rope_kernel(
const int bias = linear_index % half_hidden_size;
const int hi = bias / half_head_size; // q + k + v
const int h_bias = bias % half_head_size;
const int start_token_idx = cu_seqlens_q[ori_bi];
const int start_token_idx = ori_bi * max_seq_len - cum_offsets[ori_bi];
if (seq_lens_encoder[ori_bi] > 0) return;
const int write_seq_id = seq_lens[ori_bi];
if (write_seq_id == 0) continue;
@@ -366,8 +366,8 @@ __global__ void append_decode_cache_T_neox_rope_kernel(
// head_size // 2]
T* __restrict__ qkv_out,
const int* __restrict__ block_tables, // [bsz, max_blocks_per_seq]
const int* __restrict__ batch_id_per_token, // [num_tokens]
const int* __restrict__ cu_seqlens_q,
const int* __restrict__ padding_offsets, // [num_tokens]
const int* __restrict__ cum_offsets,
const int* __restrict__ seq_lens, // [bsz]
const int* __restrict__ seq_lens_encoder, // [bsz]
const float* __restrict__ cos_emb,
@@ -409,7 +409,7 @@ __global__ void append_decode_cache_T_neox_rope_kernel(
const int bias = linear_index % half_hidden_size;
const int hi = bias / half_head_size; // q + k + v
const int h_bias = bias % half_head_size;
const int start_token_idx = cu_seqlens_q[ori_bi];
const int start_token_idx = ori_bi * max_seq_len - cum_offsets[ori_bi];
if (seq_lens_encoder[ori_bi] > 0) return;
const int write_seq_id = seq_lens[ori_bi];
if (write_seq_id == 0) continue;
@@ -498,8 +498,8 @@ __global__ void append_decode_cache_int8_rope_kernel(
// block_size, head_size // 2]
T* __restrict__ qkv_out,
const int* __restrict__ block_tables, // [bsz, max_blocks_per_seq]
const int* __restrict__ batch_id_per_token, // [num_tokens]
const int* __restrict__ cu_seqlens_q,
const int* __restrict__ padding_offsets, // [num_tokens]
const int* __restrict__ cum_offsets,
const int* __restrict__ seq_lens, // [bsz]
const int* __restrict__ seq_lens_encoder, // [bsz]
const float* __restrict__ cos_emb,
@@ -523,7 +523,7 @@ __global__ void append_decode_cache_int8_rope_kernel(
int q_head_idx, k_head_idx, v_idx;
const int64_t hidden_size = (num_heads + 2 * kv_num_heads) * HeadDim;
constexpr int half_head_size = HeadDim / 2;
const int start_token_idx = cu_seqlens_q[bid];
const int start_token_idx = bid * max_seq_len - __ldg(&cum_offsets[bid]);
if (seq_lens_encoder[bid] > 0) return;
const int write_seq_id = seq_lens[bid];
if (write_seq_id == 0) return;
@@ -745,8 +745,8 @@ __global__ void append_decode_cache_int8_rope_kernel(
// block_size, head_size // 2]
T* __restrict__ qkv_out,
const int* __restrict__ block_tables, // [bsz, max_blocks_per_seq]
const int* __restrict__ batch_id_per_token, // [num_tokens]
const int* __restrict__ cu_seqlens_q,
const int* __restrict__ padding_offsets, // [num_tokens]
const int* __restrict__ cum_offsets,
const int* __restrict__ seq_lens, // [bsz]
const int* __restrict__ seq_lens_encoder, // [bsz]
const float* __restrict__ cos_emb,
@@ -775,7 +775,7 @@ __global__ void append_decode_cache_int8_rope_kernel(
int q_head_idx, k_head_idx, v_idx;
const int64_t hidden_size = (num_heads + 2 * kv_num_heads) * HeadDim;
constexpr int half_head_size = HeadDim / 2;
const int start_token_idx = cu_seqlens_q[bid];
const int start_token_idx = bid * max_seq_len - __ldg(&cum_offsets[bid]);
if (seq_lens_encoder[bid] > 0) return;
const int write_seq_id = seq_lens[bid];
if (write_seq_id == 0) return;
@@ -1047,8 +1047,8 @@ __global__ void append_decode_cache_int8_neox_rope_kernel(
// block_size, head_size // 2]
T* __restrict__ qkv_out,
const int* __restrict__ block_tables, // [bsz, max_blocks_per_seq]
const int* __restrict__ batch_id_per_token, // [num_tokens]
const int* __restrict__ cu_seqlens_q,
const int* __restrict__ padding_offsets, // [num_tokens]
const int* __restrict__ cum_offsets,
const int* __restrict__ seq_lens, // [bsz]
const int* __restrict__ seq_lens_encoder, // [bsz]
const float* __restrict__ cos_emb,
@@ -1073,7 +1073,7 @@ __global__ void append_decode_cache_int8_neox_rope_kernel(
int q_head_idx, k_head_idx, v_idx;
const int64_t hidden_size = (num_heads + 2 * kv_num_heads) * HeadDim;
constexpr int half_head_size = HeadDim / 2;
const int start_token_idx = cu_seqlens_q[bid];
const int start_token_idx = bid * max_seq_len - __ldg(&cum_offsets[bid]);
if (seq_lens_encoder[bid] > 0) return;
const int write_seq_id = seq_lens[bid];
if (write_seq_id == 0) return;
@@ -1346,8 +1346,8 @@ __global__ void append_decode_cache_int8_neox_rope_kernel(
// block_size, head_size // 2]
T* __restrict__ qkv_out,
const int* __restrict__ block_tables, // [bsz, max_blocks_per_seq]
const int* __restrict__ batch_id_per_token, // [num_tokens]
const int* __restrict__ cu_seqlens_q,
const int* __restrict__ padding_offsets, // [num_tokens]
const int* __restrict__ cum_offsets,
const int* __restrict__ seq_lens, // [bsz]
const int* __restrict__ seq_lens_encoder, // [bsz]
const float* __restrict__ cos_emb,
@@ -1377,7 +1377,7 @@ __global__ void append_decode_cache_int8_neox_rope_kernel(
const int64_t hidden_size = (num_heads + 2 * kv_num_heads) * HeadDim;
constexpr int half_head_size = HeadDim / 2;
const int start_token_idx = cu_seqlens_q[bid];
const int start_token_idx = bid * max_seq_len - __ldg(&cum_offsets[bid]);
if (seq_lens_encoder[bid] > 0) return;
const int write_seq_id = seq_lens[bid];
if (write_seq_id == 0) return;
@@ -1739,8 +1739,8 @@ __global__ void append_decode_cache_int4_rope_kernel(
// block_size, head_size // 2]
T* __restrict__ qkv_out,
const int* __restrict__ block_tables, // [bsz, max_blocks_per_seq]
const int* __restrict__ batch_id_per_token, // [num_tokens]
const int* __restrict__ cu_seqlens_q,
const int* __restrict__ padding_offsets, // [num_tokens]
const int* __restrict__ cum_offsets,
const int* __restrict__ seq_lens, // [bsz]
const int* __restrict__ seq_lens_encoder, // [bsz]
const float* __restrict__ cos_emb,
@@ -1766,7 +1766,7 @@ __global__ void append_decode_cache_int4_rope_kernel(
const int64_t hidden_size = (num_heads + 2 * kv_num_heads) * HeadDim;
constexpr int half_head_size = HeadDim / 2;
const int half_block_size = block_size / 2;
const int start_token_idx = cu_seqlens_q[bid];
const int start_token_idx = bid * max_seq_len - __ldg(&cum_offsets[bid]);
if (seq_lens_encoder[bid] > 0) return;
const int write_seq_id = seq_lens[bid];
if (write_seq_id == 0) return;
@@ -2034,8 +2034,8 @@ __global__ void append_decode_cache_int4_rope_kernel(
// block_size, head_size // 2]
T* __restrict__ qkv_out,
const int* __restrict__ block_tables, // [bsz, max_blocks_per_seq]
const int* __restrict__ batch_id_per_token, // [num_tokens]
const int* __restrict__ cu_seqlens_q,
const int* __restrict__ padding_offsets, // [num_tokens]
const int* __restrict__ cum_offsets,
const int* __restrict__ seq_lens, // [bsz]
const int* __restrict__ seq_lens_encoder, // [bsz]
const float* __restrict__ cos_emb,
@@ -2066,7 +2066,7 @@ __global__ void append_decode_cache_int4_rope_kernel(
const int64_t hidden_size = (num_heads + 2 * kv_num_heads) * HeadDim;
constexpr int half_head_size = HeadDim / 2;
const int half_block_size = block_size / 2;
const int start_token_idx = cu_seqlens_q[bid];
const int start_token_idx = bid * max_seq_len - __ldg(&cum_offsets[bid]);
if (seq_lens_encoder[bid] > 0) return;
const int write_seq_id = seq_lens[bid];
if (write_seq_id == 0) return;
@@ -2362,8 +2362,8 @@ __global__ void append_decode_cache_int4_neox_rope_kernel(
// block_size, head_size // 2]
T* __restrict__ qkv_out,
const int* __restrict__ block_tables, // [bsz, max_blocks_per_seq]
const int* __restrict__ batch_id_per_token, // [num_tokens]
const int* __restrict__ cu_seqlens_q,
const int* __restrict__ padding_offsets, // [num_tokens]
const int* __restrict__ cum_offsets,
const int* __restrict__ seq_lens, // [bsz]
const int* __restrict__ seq_lens_encoder, // [bsz]
const float* __restrict__ cos_emb,
@@ -2389,7 +2389,7 @@ __global__ void append_decode_cache_int4_neox_rope_kernel(
const int64_t hidden_size = (num_heads + 2 * kv_num_heads) * HeadDim;
constexpr int half_head_size = HeadDim / 2;
const int half_block_size = block_size / 2;
const int start_token_idx = cu_seqlens_q[bid];
const int start_token_idx = bid * max_seq_len - __ldg(&cum_offsets[bid]);
if (seq_lens_encoder[bid] > 0) return;
const int write_seq_id = seq_lens[bid];
if (write_seq_id == 0) return;
@@ -2732,8 +2732,8 @@ __global__ void append_decode_cache_int4_neox_rope_kernel(
// block_size, head_size // 2]
T* __restrict__ qkv_out,
const int* __restrict__ block_tables, // [bsz, max_blocks_per_seq]
const int* __restrict__ batch_id_per_token, // [num_tokens]
const int* __restrict__ cu_seqlens_q,
const int* __restrict__ padding_offsets, // [num_tokens]
const int* __restrict__ cum_offsets,
const int* __restrict__ seq_lens, // [bsz]
const int* __restrict__ seq_lens_encoder, // [bsz]
const float* __restrict__ cos_emb,
@@ -2764,7 +2764,7 @@ __global__ void append_decode_cache_int4_neox_rope_kernel(
const int64_t hidden_size = (num_heads + 2 * kv_num_heads) * HeadDim;
constexpr int half_head_size = HeadDim / 2;
const int half_block_size = block_size / 2;
const int start_token_idx = cu_seqlens_q[bid];
const int start_token_idx = bid * max_seq_len - __ldg(&cum_offsets[bid]);
if (seq_lens_encoder[bid] > 0) return;
const int write_seq_id = seq_lens[bid];
if (write_seq_id == 0) return;

View File

@@ -21,8 +21,8 @@ void append_decode_cache_rope(const QKV_TYPE* qkv,
T* value_cache,
T* qkv_out,
const int* block_tables,
const int* batch_id_per_token,
const int* cu_seqlens_q,
const int* padding_offsets,
const int* cum_offsets,
const int* seq_lens,
const int* seq_lens_encoder,
const float* cos_emb,
@@ -57,8 +57,8 @@ void append_decode_cache_rope(const QKV_TYPE* qkv,
value_cache,
qkv_out,
block_tables,
batch_id_per_token,
cu_seqlens_q,
padding_offsets,
cum_offsets,
seq_lens,
seq_lens_encoder,
cos_emb,
@@ -79,8 +79,8 @@ void append_decode_cache_rope(const QKV_TYPE* qkv,
value_cache,
qkv_out,
block_tables,
batch_id_per_token,
cu_seqlens_q,
padding_offsets,
cum_offsets,
seq_lens,
seq_lens_encoder,
cos_emb,
@@ -102,8 +102,8 @@ void append_decode_cache_rope(const QKV_TYPE* qkv,
value_cache,
qkv_out,
block_tables,
batch_id_per_token,
cu_seqlens_q,
padding_offsets,
cum_offsets,
seq_lens,
seq_lens_encoder,
cos_emb,
@@ -125,8 +125,8 @@ void append_decode_cache_rope(const QKV_TYPE* qkv,
value_cache,
qkv_out,
block_tables,
batch_id_per_token,
cu_seqlens_q,
padding_offsets,
cum_offsets,
seq_lens,
seq_lens_encoder,
cos_emb,
@@ -149,8 +149,8 @@ void append_decode_cache_int8_rope(const QKV_TYPE* qkv,
uint8_t* value_cache,
T* qkv_out,
const int* block_tables,
const int* batch_id_per_token,
const int* cu_seqlens_q,
const int* padding_offsets,
const int* cum_offsets,
const int* seq_lens,
const int* seq_lens_encoder,
const float* cos_emb,
@@ -182,8 +182,8 @@ void append_decode_cache_int8_rope(const QKV_TYPE* qkv,
value_cache,
qkv_out,
block_tables,
batch_id_per_token,
cu_seqlens_q,
padding_offsets,
cum_offsets,
seq_lens,
seq_lens_encoder,
cos_emb,
@@ -207,8 +207,8 @@ void append_decode_cache_int8_rope(const QKV_TYPE* qkv,
value_cache,
qkv_out,
block_tables,
batch_id_per_token,
cu_seqlens_q,
padding_offsets,
cum_offsets,
seq_lens,
seq_lens_encoder,
cos_emb,
@@ -232,8 +232,8 @@ void append_decode_cache_int8_rope(const QKV_TYPE* qkv,
value_cache,
qkv_out,
block_tables,
batch_id_per_token,
cu_seqlens_q,
padding_offsets,
cum_offsets,
seq_lens,
seq_lens_encoder,
cos_emb,
@@ -257,8 +257,8 @@ void append_decode_cache_int8_rope(const QKV_TYPE* qkv,
value_cache,
qkv_out,
block_tables,
batch_id_per_token,
cu_seqlens_q,
padding_offsets,
cum_offsets,
seq_lens,
seq_lens_encoder,
cos_emb,
@@ -282,8 +282,8 @@ void append_decode_cache_int4_rope(const QKV_TYPE* qkv,
uint8_t* value_cache,
T* qkv_out,
const int* block_tables,
const int* batch_id_per_token,
const int* cu_seqlens_q,
const int* padding_offsets,
const int* cum_offsets,
const int* seq_lens,
const int* seq_lens_encoder,
const float* cos_emb,
@@ -317,8 +317,8 @@ void append_decode_cache_int4_rope(const QKV_TYPE* qkv,
value_cache,
qkv_out,
block_tables,
batch_id_per_token,
cu_seqlens_q,
padding_offsets,
cum_offsets,
seq_lens,
seq_lens_encoder,
cos_emb,
@@ -344,8 +344,8 @@ void append_decode_cache_int4_rope(const QKV_TYPE* qkv,
value_cache,
qkv_out,
block_tables,
batch_id_per_token,
cu_seqlens_q,
padding_offsets,
cum_offsets,
seq_lens,
seq_lens_encoder,
cos_emb,
@@ -371,8 +371,8 @@ void append_decode_cache_int4_rope(const QKV_TYPE* qkv,
value_cache,
qkv_out,
block_tables,
batch_id_per_token,
cu_seqlens_q,
padding_offsets,
cum_offsets,
seq_lens,
seq_lens_encoder,
cos_emb,
@@ -398,8 +398,8 @@ void append_decode_cache_int4_rope(const QKV_TYPE* qkv,
value_cache,
qkv_out,
block_tables,
batch_id_per_token,
cu_seqlens_q,
padding_offsets,
cum_offsets,
seq_lens,
seq_lens_encoder,
cos_emb,
@@ -424,8 +424,8 @@ void DecoderWriteCacheWithRoPEKernel(
const paddle::Tensor& qkv,
const paddle::Tensor& seq_lens,
const paddle::Tensor& seq_lens_encoder,
const paddle::Tensor& batch_id_per_token,
const paddle::Tensor& cu_seqlens_q,
const paddle::Tensor& padding_offsets,
const paddle::Tensor& cum_offsets,
const paddle::Tensor& block_tables,
const paddle::optional<paddle::Tensor>& rotary_embs,
const paddle::optional<paddle::Tensor>& qkv_out_scales,
@@ -471,8 +471,8 @@ void DecoderWriteCacheWithRoPEKernel(
reinterpret_cast<DataType_*>(value_cache_out->data<T>()),
reinterpret_cast<DataType_*>(qkv_out->data<T>()),
block_tables.data<int>(),
batch_id_per_token.data<int>(),
cu_seqlens_q.data<int>(),
padding_offsets.data<int>(),
cum_offsets.data<int>(),
seq_lens.data<int>(),
seq_lens_encoder.data<int>(),
cos_emb,
@@ -503,8 +503,8 @@ void DecoderWriteCacheWithRoPEKernel(
value_cache_out->data<uint8_t>(),
reinterpret_cast<DataType_*>(qkv_out->data<T>()),
block_tables.data<int>(),
batch_id_per_token.data<int>(),
cu_seqlens_q.data<int>(),
padding_offsets.data<int>(),
cum_offsets.data<int>(),
seq_lens.data<int>(),
seq_lens_encoder.data<int>(),
cos_emb,
@@ -536,8 +536,8 @@ void DecoderWriteCacheWithRoPEKernel(
value_cache_out->data<uint8_t>(),
reinterpret_cast<DataType_*>(qkv_out->data<T>()),
block_tables.data<int>(),
batch_id_per_token.data<int>(),
cu_seqlens_q.data<int>(),
padding_offsets.data<int>(),
cum_offsets.data<int>(),
seq_lens.data<int>(),
seq_lens_encoder.data<int>(),
cos_emb,
@@ -570,8 +570,8 @@ void DecoderWriteCacheWithRoPEKernel(
value_cache_out->data<uint8_t>(),
reinterpret_cast<DataType_*>(qkv_out->data<T>()),
block_tables.data<int>(),
batch_id_per_token.data<int>(),
cu_seqlens_q.data<int>(),
padding_offsets.data<int>(),
cum_offsets.data<int>(),
seq_lens.data<int>(),
seq_lens_encoder.data<int>(),
cos_emb,
@@ -603,8 +603,8 @@ void DecoderWriteCacheWithRoPEKernel(
value_cache_out->data<uint8_t>(),
reinterpret_cast<DataType_*>(const_cast<T*>(qkv_out->data<T>())),
block_tables.data<int>(),
batch_id_per_token.data<int>(),
cu_seqlens_q.data<int>(),
padding_offsets.data<int>(),
cum_offsets.data<int>(),
seq_lens.data<int>(),
seq_lens_encoder.data<int>(),
cos_emb,
@@ -650,8 +650,8 @@ template void DecoderWriteCacheWithRoPEKernel<paddle::bfloat16, int>(
// kv_num_heads, head_dim] if GQA)
const paddle::Tensor& seq_lens,
const paddle::Tensor& seq_lens_encoder,
const paddle::Tensor& batch_id_per_token,
const paddle::Tensor& cu_seqlens_q,
const paddle::Tensor& padding_offsets,
const paddle::Tensor& cum_offsets,
const paddle::Tensor& block_tables,
const paddle::optional<paddle::Tensor>& rotary_embs,
const paddle::optional<paddle::Tensor>& qkv_out_scales,
@@ -677,8 +677,8 @@ DecoderWriteCacheWithRoPEKernel<paddle::bfloat16, paddle::bfloat16>(
// kv_num_heads, head_dim] if GQA)
const paddle::Tensor& seq_lens,
const paddle::Tensor& seq_lens_encoder,
const paddle::Tensor& batch_id_per_token,
const paddle::Tensor& cu_seqlens_q,
const paddle::Tensor& padding_offsets,
const paddle::Tensor& cum_offsets,
const paddle::Tensor& block_tables,
const paddle::optional<paddle::Tensor>& rotary_embs,
const paddle::optional<paddle::Tensor>& qkv_out_scales,
@@ -703,8 +703,8 @@ template void DecoderWriteCacheWithRoPEKernel<paddle::float16, int>(
// kv_num_heads, head_dim] if GQA)
const paddle::Tensor& seq_lens,
const paddle::Tensor& seq_lens_encoder,
const paddle::Tensor& batch_id_per_token,
const paddle::Tensor& cu_seqlens_q,
const paddle::Tensor& padding_offsets,
const paddle::Tensor& cum_offsets,
const paddle::Tensor& block_tables,
const paddle::optional<paddle::Tensor>& rotary_embs,
const paddle::optional<paddle::Tensor>& qkv_out_scales,
@@ -729,8 +729,8 @@ template void DecoderWriteCacheWithRoPEKernel<paddle::float16, paddle::float16>(
// kv_num_heads, head_dim] if GQA)
const paddle::Tensor& seq_lens,
const paddle::Tensor& seq_lens_encoder,
const paddle::Tensor& batch_id_per_token,
const paddle::Tensor& cu_seqlens_q,
const paddle::Tensor& padding_offsets,
const paddle::Tensor& cum_offsets,
const paddle::Tensor& block_tables,
const paddle::optional<paddle::Tensor>& rotary_embs,
const paddle::optional<paddle::Tensor>& qkv_out_scales,

View File

@@ -23,8 +23,8 @@ void DecoderWriteCacheWithRoPEKernel(
// kv_num_heads, head_dim] if GQA)
const paddle::Tensor& seq_lens,
const paddle::Tensor& seq_lens_encoder,
const paddle::Tensor& batch_id_per_token,
const paddle::Tensor& cu_seqlens_q,
const paddle::Tensor& padding_offsets,
const paddle::Tensor& cum_offsets,
const paddle::Tensor& block_tables,
const paddle::optional<paddle::Tensor>& rotary_embs,
const paddle::optional<paddle::Tensor>& qkv_out_scales,
@@ -40,4 +40,4 @@ void DecoderWriteCacheWithRoPEKernel(
cudaStream_t& stream,
paddle::Tensor* qkv_out,
paddle::Tensor* key_cache_out,
paddle::Tensor* value_cache_out);
paddle::Tensor* value_cache_out);

View File

@@ -23,8 +23,7 @@ __global__ void VariableLengthRotaryKernel(
const int *qkv,
const float *cos_emb, // [1, 1, seq_len, dim_head / 2]
const float *sin_emb,
const int *batch_id_per_token,
const int *cu_seqlens_q,
const int *padding_offsets,
const int *seq_lens,
const int *seq_lens_decoder,
const float *qkv_out_scales, // [3, num_head, dim_head]
@@ -53,7 +52,8 @@ __global__ void VariableLengthRotaryKernel(
linear_index < elem_cnt;
linear_index += step) {
const int token_idx = linear_index / offset;
const int ori_bi = batch_id_per_token[token_idx];
const int ori_token_idx = token_idx + padding_offsets[token_idx];
const int ori_bi = ori_token_idx / seq_len;
if (seq_lens && seq_lens[ori_bi] == 0) continue;
const int bias = linear_index % offset;
const int qkv_id = bias / hidden_size;
@@ -61,7 +61,7 @@ __global__ void VariableLengthRotaryKernel(
const int hi = qkv_bias / last_dim;
const int h_bias = qkv_bias % last_dim;
const int ori_seq_id = (token_idx - cu_seqlens_q[ori_bi]) + seq_lens_decoder[ori_bi];
const int ori_seq_id = ori_token_idx % seq_len + seq_lens_decoder[ori_bi];
const int emb_idx = ori_seq_id * half_lastdim + h_bias / 2;
const int bias_idx = qkv_id * hidden_size + hi * last_dim + h_bias;
@@ -107,8 +107,7 @@ __global__ void VariableLengthRotaryKernel(
const T *qkv,
const float *cos_emb, // [1, 1, seq_len, dim_head / 2]
const float *sin_emb,
const int *batch_id_per_token,
const int *cu_seqlens_q,
const int *padding_offsets,
const int *seq_lens,
const int *seq_lens_decoder,
T *qkv_out,
@@ -131,7 +130,8 @@ __global__ void VariableLengthRotaryKernel(
linear_index < elem_cnt;
linear_index += step) {
const int token_idx = linear_index / offset;
const int ori_bi = batch_id_per_token[token_idx];
const int ori_token_idx = token_idx + padding_offsets[token_idx];
const int ori_bi = ori_token_idx / seq_len;
if (seq_lens && seq_lens[ori_bi] == 0) continue;
const int bias = linear_index % offset;
const int qkv_id = bias / hidden_size;
@@ -139,7 +139,7 @@ __global__ void VariableLengthRotaryKernel(
const int hi = qkv_bias / last_dim;
const int h_bias = qkv_bias % last_dim;
const int ori_seq_id = (token_idx - cu_seqlens_q[ori_bi]) + seq_lens_decoder[ori_bi];
const int ori_seq_id = ori_token_idx % seq_len + seq_lens_decoder[ori_bi];
const int emb_idx = ori_seq_id * half_lastdim + h_bias / 2;
const int64_t base_idx = token_idx * 3 * hidden_size +
@@ -167,8 +167,7 @@ __global__ void NeoxVariableLengthRotaryKernel(
const int *qkv,
const float *cos_emb, // [1, 1, seq_len, dim_head / 2]
const float *sin_emb,
const int *batch_id_per_token,
const int *cu_seqlens_q,
const int *padding_offsets,
const int *seq_lens,
const int *seq_lens_decoder,
const float *qkv_out_scales, // [3, num_head, dim_head]
@@ -200,7 +199,8 @@ __global__ void NeoxVariableLengthRotaryKernel(
linear_index < elem_cnt;
linear_index += step) {
const int token_idx = linear_index / offset;
const int ori_bi = batch_id_per_token[token_idx];
const int ori_token_idx = token_idx + padding_offsets[token_idx];
const int ori_bi = ori_token_idx / seq_len;
if (seq_lens && seq_lens[ori_bi] == 0) continue;
const int bias = linear_index % offset;
const int qkv_id = bias / hidden_size;
@@ -208,7 +208,7 @@ __global__ void NeoxVariableLengthRotaryKernel(
const int hi = qkv_bias / half_lastdim;
const int h_bias = qkv_bias % half_lastdim;
const int ori_seq_id = (token_idx - cu_seqlens_q[ori_bi]) + seq_lens_decoder[ori_bi];
const int ori_seq_id = ori_token_idx % seq_len + seq_lens_decoder[ori_bi];
const int emb_idx = ori_seq_id * last_dim + h_bias;
const int bias_idx_left =
@@ -261,8 +261,7 @@ __global__ void NeoxVariableLengthRotaryKernel(
const T *qkv,
const float *cos_emb, // [1, 1, seq_len, dim_head / 2]
const float *sin_emb,
const int *batch_id_per_token,
const int *cu_seqlens_q,
const int *padding_offsets,
const int *seq_lens,
const int *seq_lens_decoder,
T *qkv_out,
@@ -286,7 +285,8 @@ __global__ void NeoxVariableLengthRotaryKernel(
linear_index < elem_cnt;
linear_index += step) {
const int token_idx = linear_index / offset;
const int ori_bi = batch_id_per_token[token_idx];
const int ori_token_idx = token_idx + padding_offsets[token_idx];
const int ori_bi = ori_token_idx / seq_len;
if (seq_lens && seq_lens[ori_bi] == 0) continue;
const int bias = linear_index % offset;
const int qkv_id = bias / hidden_size;
@@ -294,7 +294,7 @@ __global__ void NeoxVariableLengthRotaryKernel(
const int hi = qkv_bias / half_lastdim;
const int h_bias = qkv_bias % half_lastdim;
const int ori_seq_id = (token_idx - cu_seqlens_q[ori_bi]) + seq_lens_decoder[ori_bi];
const int ori_seq_id = ori_token_idx % seq_len + seq_lens_decoder[ori_bi];
const int emb_idx = ori_seq_id * last_dim + h_bias;
const int base_idx_left = token_idx * 3 * full_hidden_size +
@@ -327,8 +327,7 @@ __global__ void GQAVariableLengthRotaryKernel(
const int *qkv,
const float *cos_emb, // [1, 1, seq_len, dim_head / 2]
const float *sin_emb,
const int *batch_id_per_token,
const int *cu_seqlens_q,
const int *padding_offsets,
const int *seq_lens,
const int *seq_lens_decoder,
const float *qkv_out_scales, // [3, q_num_head, dim_head]
@@ -358,13 +357,14 @@ __global__ void GQAVariableLengthRotaryKernel(
linear_index < elem_cnt;
linear_index += step) {
const int token_idx = linear_index / offset;
const int ori_bi = batch_id_per_token[token_idx];
const int ori_token_idx = token_idx + padding_offsets[token_idx];
const int ori_bi = ori_token_idx / seq_len;
if (seq_lens[ori_bi] == 0) continue;
const int bias = linear_index % offset;
const int hi = bias / last_dim;
const int h_bias = bias % last_dim;
const int ori_seq_id = (token_idx - cu_seqlens_q[ori_bi]) + seq_lens_decoder[ori_bi];
const int ori_seq_id = ori_token_idx % seq_len + seq_lens_decoder[ori_bi];
const int64_t emb_idx = ori_seq_id * half_lastdim + h_bias / 2;
const int64_t bias_idx = hi * last_dim + h_bias;
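The emb_idx computed above indexes the cos/sin tables of shape [1, 1, seq_len, dim_head / 2]. For readers unfamiliar with the interleaved rotary layout, here is a small CPU sketch of how such an index is used to rotate one head vector; the pairing convention and the helper name apply_rope are assumptions for illustration (the Neox variants in this file pair elements differently), not the kernel's exact code.

#include <cmath>
#include <cstdio>
#include <vector>

// Rotate one head vector in-place using interleaved RoPE (illustrative sketch).
void apply_rope(std::vector<float>& x, const std::vector<float>& cos_emb,
                const std::vector<float>& sin_emb, int ori_seq_id, int last_dim) {
  const int half_lastdim = last_dim / 2;
  for (int h_bias = 0; h_bias < last_dim; h_bias += 2) {
    // Same lookup as the kernels: emb_idx = ori_seq_id * half_lastdim + h_bias / 2.
    const int emb_idx = ori_seq_id * half_lastdim + h_bias / 2;
    const float c = cos_emb[emb_idx], s = sin_emb[emb_idx];
    const float x0 = x[h_bias], x1 = x[h_bias + 1];
    x[h_bias] = x0 * c - x1 * s;
    x[h_bias + 1] = x1 * c + x0 * s;
  }
}

int main() {
  const int seq_len = 8, last_dim = 4, half = last_dim / 2;
  std::vector<float> cos_emb(seq_len * half), sin_emb(seq_len * half);
  for (int s = 0; s < seq_len; ++s) {
    for (int d = 0; d < half; ++d) {
      const float theta = s * std::pow(10000.0f, -2.0f * d / last_dim);
      cos_emb[s * half + d] = std::cos(theta);
      sin_emb[s * half + d] = std::sin(theta);
    }
  }
  std::vector<float> q = {1.0f, 0.0f, 0.0f, 1.0f};
  apply_rope(q, cos_emb, sin_emb, /*ori_seq_id=*/3, last_dim);
  std::printf("%f %f %f %f\n", q[0], q[1], q[2], q[3]);
  return 0;
}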
@@ -410,8 +410,7 @@ __global__ void GQAVariableLengthRotaryKernel(
const T *qkv,
const float *cos_emb,
const float *sin_emb,
const int *batch_id_per_token,
const int *cu_seqlens_q,
const int *padding_offsets,
const int *seq_lens,
const int *seq_lens_decoder,
T *qkv_out,
@@ -435,13 +434,14 @@ __global__ void GQAVariableLengthRotaryKernel(
linear_index < elem_cnt;
linear_index += step) {
const int token_idx = linear_index / offset;
const int ori_bi = batch_id_per_token[token_idx];
const int ori_token_idx = token_idx + padding_offsets[token_idx];
const int ori_bi = ori_token_idx / seq_len;
if (seq_lens[ori_bi] == 0) continue;
const int bias = linear_index % offset;
const int hi = bias / last_dim;
const int h_bias = bias % last_dim;
const int ori_seq_id = (token_idx - cu_seqlens_q[ori_bi]) + seq_lens_decoder[ori_bi];
const int ori_seq_id = ori_token_idx % seq_len + seq_lens_decoder[ori_bi];
const int64_t emb_idx = ori_seq_id * half_lastdim + h_bias / 2;
const int64_t base_idx =
@@ -472,8 +472,7 @@ __global__ void GQAVariableLengthRotaryQuantKVKernel(const int *qkv,
const float *cos_emb, // [1, 1, seq_len, dim_head / 2]
const float *sin_emb,
const float *qkv_out_scales,
const int *batch_id_per_token,
const int *cu_seqlens_q,
const int *padding_offsets,
const int *seq_lens,
const int *seq_lens_decoder,
const T *qkv_biases,
@@ -505,13 +504,15 @@ __global__ void GQAVariableLengthRotaryQuantKVKernel(const int *qkv,
linear_index < elem_cnt;
linear_index += step) {
const int token_idx = linear_index / offset;
const int ori_bi = batch_id_per_token[token_idx];
const int ori_token_idx = token_idx + padding_offsets[token_idx];
const int ori_bi = ori_token_idx / seq_len;
if (seq_lens[ori_bi] == 0) continue;
const int bias = linear_index % offset;
const int hi = bias / last_dim;
const int h_bias = bias % last_dim;
int ori_seq_id = (token_idx - cu_seqlens_q[ori_bi]) + seq_lens_decoder[ori_bi];
int ori_seq_id;
ori_seq_id = ori_token_idx % seq_len + seq_lens_decoder[ori_bi];
const int64_t emb_idx = ori_seq_id * half_lastdim + h_bias / 2;
const int64_t bias_idx = hi * last_dim + h_bias;
@@ -560,8 +561,7 @@ template <typename T, int VecSize = 1>
__global__ void GQAVariableLengthRotaryQuantKVKernel(const T *qkv,
const float *cos_emb, // [1, 1, seq_len, dim_head / 2]
const float *sin_emb,
const int *batch_id_per_token,
const int *cu_seqlens_q,
const int *padding_offsets,
const int *seq_lens,
const int *seq_lens_decoder,
const T *qkv_biases,
@@ -590,13 +590,15 @@ __global__ void GQAVariableLengthRotaryQuantKVKernel(const T *qkv,
linear_index < elem_cnt;
linear_index += step) {
const int token_idx = linear_index / offset;
const int ori_bi = batch_id_per_token[token_idx];
const int ori_token_idx = token_idx + padding_offsets[token_idx];
const int ori_bi = ori_token_idx / seq_len;
if (seq_lens[ori_bi] == 0) continue;
const int bias = linear_index % offset;
const int hi = bias / last_dim;
const int h_bias = bias % last_dim;
int ori_seq_id = (token_idx - cu_seqlens_q[ori_bi]) + seq_lens_decoder[ori_bi];
int ori_seq_id;
ori_seq_id = ori_token_idx % seq_len + seq_lens_decoder[ori_bi];
const int64_t emb_idx = ori_seq_id * half_lastdim + h_bias / 2;
const int64_t bias_idx = hi * last_dim + h_bias;
@@ -643,8 +645,7 @@ __global__ void GQANeoxVariableLengthRotaryKernel(
const int *qkv,
const float *cos_emb, // [1, 1, seq_len, dim_head / 2]
const float *sin_emb,
const int *batch_id_per_token,
const int *cu_seqlens_q,
const int *padding_offsets,
const int *seq_lens,
const int *seq_lens_decoder,
const float *qkv_out_scales, // [3, q_num_head, dim_head]
@@ -675,13 +676,14 @@ __global__ void GQANeoxVariableLengthRotaryKernel(
linear_index < elem_cnt;
linear_index += step) {
const int token_idx = linear_index / offset;
const int ori_bi = batch_id_per_token[token_idx];
const int ori_token_idx = token_idx + padding_offsets[token_idx];
const int ori_bi = ori_token_idx / seq_len;
if (seq_lens && seq_lens[ori_bi] == 0) continue;
const int bias = linear_index % offset;
const int hi = bias / half_lastdim;
const int h_bias = bias % half_lastdim;
const int ori_seq_id = (token_idx - cu_seqlens_q[ori_bi]) + seq_lens_decoder[ori_bi];
const int ori_seq_id = ori_token_idx % seq_len + seq_lens_decoder[ori_bi];
const int emb_idx = ori_seq_id * last_dim + h_bias;
const int bias_idx_left = hi * last_dim + h_bias;
@@ -734,8 +736,7 @@ __global__ void GQANeoxVariableLengthRotaryKernel(
const T *qkv,
const float *cos_emb,
const float *sin_emb,
const int *batch_id_per_token,
const int *cu_seqlens_q,
const int *padding_offsets,
const int *seq_lens,
const int *seq_lens_decoder,
const float *qkv_out_scales,
@@ -760,13 +761,14 @@ __global__ void GQANeoxVariableLengthRotaryKernel(
linear_index < elem_cnt;
linear_index += step) {
const int token_idx = linear_index / offset;
const int ori_bi = batch_id_per_token[token_idx];
const int ori_token_idx = token_idx + padding_offsets[token_idx];
const int ori_bi = ori_token_idx / seq_len;
if (seq_lens && seq_lens[ori_bi] == 0) continue;
const int bias = linear_index % offset;
const int hi = bias / half_lastdim;
const int h_bias = bias % half_lastdim;
const int ori_seq_id = (token_idx - cu_seqlens_q[ori_bi]) + seq_lens_decoder[ori_bi];
const int ori_seq_id = ori_token_idx % seq_len + seq_lens_decoder[ori_bi];
const int emb_idx = ori_seq_id * last_dim + h_bias;
const int base_idx_left =
@@ -803,8 +805,7 @@ __global__ void cache_kernel(
T *__restrict__ value_cache, // [num_blocks, kv_num_heads, block_size,
// head_size]
const int *__restrict__ block_tables, // [bsz, max_blocks_per_seq]
const int *__restrict__ batch_id_per_token, // [num_tokens]
const int *__restrict__ cu_seqlens_q, // [bsz]
const int *__restrict__ padding_offsets, // [num_tokens]
const int *__restrict__ seq_lens, // [bsz]
const int *__restrict__ seq_lens_decoder, // [bsz]
const int max_seq_len,
@@ -830,9 +831,11 @@ __global__ void cache_kernel(
const uint32_t qkv_bias = bias % hidden_size;
const uint32_t hi = qkv_bias / head_size;
const uint32_t h_bias = qkv_bias % head_size;
const uint32_t ori_bi = batch_id_per_token[token_idx];
const uint32_t ori_token_idx = token_idx + padding_offsets[token_idx];
const uint32_t ori_bi = ori_token_idx / max_seq_len;
if (seq_lens[ori_bi] == 0) continue;
const uint32_t ori_seq_id = (token_idx - cu_seqlens_q[ori_bi]) + seq_lens_decoder[ori_bi];
const uint32_t ori_seq_id =
ori_token_idx % max_seq_len + seq_lens_decoder[ori_bi];
const int32_t *block_table_now = nullptr;
@@ -875,8 +878,8 @@ __global__ void append_write_cache_kv_c8_qkv(
const int *__restrict__ tile_ids,
const int *__restrict__ seq_lens_this_time,
const int *__restrict__ seq_lens_decoder,
const int *__restrict__ batch_id_per_token,
const int *__restrict__ cu_seqlens_q,
const int *__restrict__ padding_offsets,
const int *__restrict__ cum_offsets,
const int *__restrict__ block_tables,
const int max_seq_len,
const int max_blocks_per_seq,
@@ -906,46 +909,15 @@ __global__ void append_write_cache_kv_c8_qkv(
const uint32_t end_len = start_len + seq_len_this_time;
const uint32_t tile_start = start_len_pad + tile_id * num_rows_per_block;
int block_id = __ldg(&block_table_now[tile_start / BLOCK_SIZE]);
uint32_t chunk_start = tile_start + wid * num_frags_z * 16 + tid / 8;
const uint32_t start_token_idx = cu_seqlens_q[batch_id];
const uint32_t start_token_idx =
batch_id * max_seq_len - cum_offsets[batch_id];
const uint32_t kv_batch_stride = (num_heads + 2 * kv_num_heads) * HEAD_DIM;
const uint32_t kv_h_stride = HEAD_DIM;
__shared__ T k_smem_ori[num_rows_per_block * HEAD_DIM];
__shared__ T v_smem_ori[num_rows_per_block * HEAD_DIM];
if (tile_start >= start_len) {
constexpr int KV_VEC_SIZE = 16 / sizeof(uint8_t); // 16
using LoadPadKVT = AlignedVector<uint8_t, KV_VEC_SIZE>;
// int lane_id = wid * 32 + tid;
// pad zero for this kv_head_idx for this block
LoadPadKVT pad_cache_vec;
*(reinterpret_cast<uint4*>(pad_cache_vec.val)) = make_uint4(0, 0, 0, 0);
// reset k
constexpr int num_vecs_per_head_k = HEAD_DIM / KV_VEC_SIZE;
constexpr int num_token_each_time_k = 32 / num_vecs_per_head_k;
uint32_t tgt_idx =
(block_id * kv_num_heads + kv_head_idx) * BLOCK_SIZE * HEAD_DIM +
tid % num_vecs_per_head_k * KV_VEC_SIZE;
for (int block_i = tid / num_vecs_per_head_k;
block_i < BLOCK_SIZE;
block_i += num_token_each_time_k) {
Store<uint8_t, KV_VEC_SIZE>(pad_cache_vec,
&cache_k[tgt_idx + block_i * HEAD_DIM]);
}
// reset v
const int num_vecs_per_head_v = BLOCK_SIZE / KV_VEC_SIZE;
const int num_token_each_time_v = 32 / num_vecs_per_head_v;
tgt_idx =
(block_id * kv_num_heads + kv_head_idx) * HEAD_DIM * BLOCK_SIZE +
tid % num_vecs_per_head_v * KV_VEC_SIZE;
for (int block_i = tid / num_vecs_per_head_v; block_i < HEAD_DIM;
block_i += num_token_each_time_v) {
Store<uint8_t, KV_VEC_SIZE>(
pad_cache_vec, &cache_v[tgt_idx + block_i * BLOCK_SIZE]);
}
}
smem_t k_smem(k_smem_ori);
smem_t v_smem(v_smem_ori);
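In the append_write_cache_kv hunks, start_token_idx changes between cu_seqlens_q[batch_id] and batch_id * max_seq_len - cum_offsets[batch_id]; both locate the first packed token of a batch, one via the prefix sum of sequence lengths and the other by subtracting accumulated padding from the padded start. A short numeric C++ sketch under assumed toy lengths follows; the array names and values are illustrative only.

#include <cassert>
#include <cstdio>
#include <vector>

int main() {
  // Assumed toy batch: per-sequence token counts 2, 3, 1; padded to max_seq_len = 4.
  const int max_seq_len = 4;
  std::vector<int> seq_lens = {2, 3, 1};

  std::vector<int> cu_seqlens_q = {0};  // prefix sum of sequence lengths
  std::vector<int> cum_offsets = {0};   // accumulated padding before each batch
  for (size_t b = 0; b < seq_lens.size(); ++b) {
    cu_seqlens_q.push_back(cu_seqlens_q.back() + seq_lens[b]);
    cum_offsets.push_back(cum_offsets.back() + (max_seq_len - seq_lens[b]));
  }

  for (int b = 0; b < static_cast<int>(seq_lens.size()); ++b) {
    const int via_cu = cu_seqlens_q[b];
    const int via_cum = b * max_seq_len - cum_offsets[b];
    std::printf("batch %d: cu_seqlens_q=%d, padded-minus-offset=%d\n", b, via_cu, via_cum);
    assert(via_cu == via_cum);  // the two expressions agree for every batch
  }
  return 0;
}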
@@ -1008,6 +980,7 @@ __global__ void append_write_cache_kv_c8_qkv(
uint32_t chunk_start_k = tile_start + wid * num_frags_z * 16 + tid / 4;
uint32_t kv_frag[4];
int block_id = __ldg(&block_table_now[tile_start / BLOCK_SIZE]);
const uint32_t write_n_stride = kv_num_heads * BLOCK_SIZE * HEAD_DIM;
const uint32_t write_h_stride = BLOCK_SIZE * HEAD_DIM;
const uint32_t write_b_stride = HEAD_DIM;
@@ -1145,8 +1118,8 @@ __global__ void append_write_cache_kv_c4_qkv(
const int *__restrict__ tile_ids,
const int *__restrict__ seq_lens_this_time,
const int *__restrict__ seq_lens_decoder,
const int *__restrict__ batch_id_per_token,
const int *__restrict__ cu_seqlens_q,
const int *__restrict__ padding_offsets,
const int *__restrict__ cum_offsets,
const int *__restrict__ block_tables,
const int max_seq_len,
const int max_blocks_per_seq,
@@ -1175,46 +1148,10 @@ __global__ void append_write_cache_kv_c4_qkv(
const uint32_t tile_start = start_len_pad + tile_id * num_rows_per_block;
uint32_t chunk_start = tile_start + wid * num_frags_z * 16 + tid / 8;
const uint32_t start_token_idx = cu_seqlens_q[batch_id];
const uint32_t start_token_idx =
batch_id * max_seq_len - cum_offsets[batch_id];
const uint32_t kv_batch_stride = (num_heads + 2 * kv_num_heads) * HEAD_DIM;
const uint32_t kv_h_stride = HEAD_DIM;
int block_id = __ldg(&block_table_now[tile_start / BLOCK_SIZE]);
const uint32_t HEAD_DIM_HALF = HEAD_DIM / 2;
const uint32_t BLOCK_SIZE_HALF = BLOCK_SIZE / 2;
if (tile_start >= start_len) {
constexpr int KV_VEC_SIZE = 16 / sizeof(uint8_t); // 16
using LoadPadKVT = AlignedVector<uint8_t, KV_VEC_SIZE>;
// pad zero for this kv_head_idx for this block
LoadPadKVT pad_cache_vec;
*(reinterpret_cast<uint4*>(pad_cache_vec.val)) = make_uint4(0, 0, 0, 0);
// reset k
constexpr int num_vecs_per_head_k = HEAD_DIM_HALF / KV_VEC_SIZE; // 4
constexpr int num_token_each_time_k = 32 / num_vecs_per_head_k; // 8
uint32_t tgt_idx =
(block_id * kv_num_heads + kv_head_idx) * BLOCK_SIZE * HEAD_DIM_HALF +
tid % num_vecs_per_head_k * KV_VEC_SIZE;
for (int block_i = tid / num_vecs_per_head_k;
block_i < BLOCK_SIZE;
block_i += num_token_each_time_k) {
Store<uint8_t, KV_VEC_SIZE>(pad_cache_vec,
&cache_k[tgt_idx + block_i * HEAD_DIM_HALF]);
}
// reset v
const int num_vecs_per_head_v = BLOCK_SIZE_HALF / KV_VEC_SIZE; // 2
const int num_token_each_time_v = 32 / num_vecs_per_head_v; // 16
tgt_idx =
(block_id * kv_num_heads + kv_head_idx) * HEAD_DIM * BLOCK_SIZE_HALF +
tid % num_vecs_per_head_v * KV_VEC_SIZE;
for (int block_i = tid / num_vecs_per_head_v; block_i < HEAD_DIM;
block_i += num_token_each_time_v) {
Store<uint8_t, KV_VEC_SIZE>(
pad_cache_vec, &cache_v[tgt_idx + block_i * BLOCK_SIZE_HALF]);
}
}
__shared__ T k_smem_ori[num_rows_per_block * HEAD_DIM];
__shared__ T v_smem_ori[num_rows_per_block * HEAD_DIM];
__shared__ T k_scale_smem[HEAD_DIM];
@@ -1325,6 +1262,7 @@ __global__ void append_write_cache_kv_c4_qkv(
uint32_t chunk_start_k = tile_start + wid * num_frags_z * 16 + tid / 4;
uint32_t kv_frag[4];
int block_id = __ldg(&block_table_now[tile_start / BLOCK_SIZE]);
const uint32_t write_n_stride = kv_num_heads * BLOCK_SIZE * HEAD_DIM / 2;
const uint32_t write_h_stride = BLOCK_SIZE * HEAD_DIM / 2;
const uint32_t write_b_stride = HEAD_DIM / 2;
@@ -1469,8 +1407,7 @@ void rotary_qk_variable(
const float *qkv_out_scales, // [3, num_head, dim_head]
const T *qkv_bias,
const float *rotary_emb, // [2, 1, 1, seq_len, dim_head / 2]
const int *batch_id_per_token,
const int *cu_seqlens_q,
const int *padding_offsets,
const int *seq_lens,
const int *seq_lens_decoder,
const int token_num,
@@ -1502,8 +1439,7 @@ void rotary_qk_variable(
reinterpret_cast<const int *>(qkv_input),
cos_emb,
sin_emb,
batch_id_per_token,
cu_seqlens_q,
padding_offsets,
seq_lens,
seq_lens_decoder,
qkv_out_scales,
@@ -1519,8 +1455,7 @@ void rotary_qk_variable(
reinterpret_cast<const T *>(qkv_input),
cos_emb,
sin_emb,
batch_id_per_token,
cu_seqlens_q,
padding_offsets,
seq_lens,
seq_lens_decoder,
qkv_out,
@@ -1538,8 +1473,7 @@ void rotary_qk_variable(
reinterpret_cast<const int *>(qkv_input),
cos_emb,
sin_emb,
batch_id_per_token,
cu_seqlens_q,
padding_offsets,
seq_lens,
seq_lens_decoder,
qkv_out_scales,
@@ -1555,8 +1489,7 @@ void rotary_qk_variable(
reinterpret_cast<const T *>(qkv_input),
cos_emb,
sin_emb,
batch_id_per_token,
cu_seqlens_q,
padding_offsets,
seq_lens,
seq_lens_decoder,
qkv_out,
@@ -1575,8 +1508,7 @@ void gqa_rotary_qk_variable(
const float *qkv_out_scales, // [3, num_head, dim_head]
const T *qkv_bias,
const float *rotary_emb, // [2, 1, 1, seq_len, dim_head / 2]
const int *batch_id_per_token,
const int *cu_seqlens_q,
const int *padding_offsets,
const int *seq_lens,
const int *seq_lens_decoder,
const int token_num,
@@ -1611,8 +1543,7 @@ void gqa_rotary_qk_variable(
reinterpret_cast<const int *>(qkv_input),
cos_emb,
sin_emb,
batch_id_per_token,
cu_seqlens_q,
padding_offsets,
seq_lens,
seq_lens_decoder,
qkv_out_scales,
@@ -1630,8 +1561,7 @@ void gqa_rotary_qk_variable(
reinterpret_cast<const T *>(qkv_input),
cos_emb,
sin_emb,
batch_id_per_token,
cu_seqlens_q,
padding_offsets,
seq_lens,
seq_lens_decoder,
qkv_out,
@@ -1651,8 +1581,7 @@ void gqa_rotary_qk_variable(
reinterpret_cast<const int *>(qkv_input),
cos_emb,
sin_emb,
batch_id_per_token,
cu_seqlens_q,
padding_offsets,
seq_lens,
seq_lens_decoder,
qkv_out_scales,
@@ -1669,8 +1598,7 @@ void gqa_rotary_qk_variable(
reinterpret_cast<const T *>(qkv_input),
cos_emb,
sin_emb,
batch_id_per_token,
cu_seqlens_q,
padding_offsets,
seq_lens,
seq_lens_decoder,
qkv_out_scales,
@@ -1694,8 +1622,7 @@ void gqa_rotary_qk_quant_variable(
const T *cache_k_scales,
const T *cache_v_scales,
const float *rotary_emb, // [2, 1, 1, seq_len, dim_head / 2]
const int *batch_id_per_token,
const int *cu_seqlens_q,
const int *padding_offsets,
const int *seq_lens,
const int *seq_lens_decoder,
const int token_num,
@@ -1727,8 +1654,7 @@ void gqa_rotary_qk_quant_variable(
cos_emb,
sin_emb,
qkv_out_scales,
batch_id_per_token,
cu_seqlens_q,
padding_offsets,
seq_lens,
seq_lens_decoder,
qkv_bias,
@@ -1747,8 +1673,7 @@ void gqa_rotary_qk_quant_variable(
reinterpret_cast<const T *>(qkv_input),
cos_emb,
sin_emb,
batch_id_per_token,
cu_seqlens_q,
padding_offsets,
seq_lens,
seq_lens_decoder,
qkv_bias,
@@ -1774,8 +1699,7 @@ void CascadeAppendWriteCacheKVQKV(
&qkv, // [token_num, 3, num_head, head_dim] ([token_num, num_head + 2 *
// kv_num_heads, head_dim] if GQA)
const paddle::Tensor &block_table,
const paddle::Tensor &batch_id_per_token,
const paddle::Tensor &cu_seqlens_q,
const paddle::Tensor &padding_offsets,
const paddle::Tensor &seq_lens_encoder,
const paddle::Tensor &seq_lens_decoder,
const int max_seq_len,
@@ -1801,8 +1725,7 @@ void CascadeAppendWriteCacheKVQKV(
reinterpret_cast<T *>(key_cache_out->data<T>()),
reinterpret_cast<T *>(value_cache_out->data<T>()),
block_table.data<int>(),
batch_id_per_token.data<int>(),
cu_seqlens_q.data<int>(),
padding_offsets.data<int>(),
seq_lens_encoder.data<int>(),
seq_lens_decoder.data<int>(),
max_seq_len,
@@ -1826,8 +1749,8 @@ void CascadeAppendWriteCacheKVC8QKV(
const paddle::Tensor &cache_v_scale, // [num_kv_heads, head_dim]
const paddle::Tensor &seq_lens_this_time,
const paddle::Tensor &seq_lens_decoder,
const paddle::Tensor &batch_id_per_token,
const paddle::Tensor &cu_seqlens_q,
const paddle::Tensor &padding_offsets,
const paddle::Tensor &cum_offsets,
const paddle::Tensor &block_table,
const paddle::Tensor &batch_ids,
const paddle::Tensor &tile_ids_per_batch,
@@ -1891,8 +1814,8 @@ void CascadeAppendWriteCacheKVC8QKV(
tile_ids_per_batch.data<int>(),
seq_lens_this_time.data<int>(),
seq_lens_decoder.data<int>(),
batch_id_per_token.data<int>(),
cu_seqlens_q.data<int>(),
padding_offsets.data<int>(),
cum_offsets.data<int>(),
block_table.data<int>(),
max_seq_len,
max_blocks_per_seq,
@@ -1914,8 +1837,8 @@ void CascadeAppendWriteCacheKVC4QKV(
const paddle::Tensor &cache_v_zp, // [num_kv_heads, head_dim]
const paddle::Tensor &seq_lens_this_time,
const paddle::Tensor &seq_lens_decoder,
const paddle::Tensor &batch_id_per_token,
const paddle::Tensor &cu_seqlens_q,
const paddle::Tensor &padding_offsets,
const paddle::Tensor &cum_offsets,
const paddle::Tensor &block_table,
const paddle::Tensor &batch_ids,
const paddle::Tensor &tile_ids_per_batch,
@@ -1961,8 +1884,8 @@ void CascadeAppendWriteCacheKVC4QKV(
tile_ids_per_batch.data<int>(),
seq_lens_this_time.data<int>(),
seq_lens_decoder.data<int>(),
batch_id_per_token.data<int>(),
cu_seqlens_q.data<int>(),
padding_offsets.data<int>(),
cum_offsets.data<int>(),
block_table.data<int>(),
max_seq_len,
max_blocks_per_seq,

View File

@@ -25,8 +25,8 @@ void EncoderWriteCacheWithRopeKernel(
const paddle::Tensor& seq_lens_this_time,
const paddle::Tensor& seq_lens_encoder,
const paddle::Tensor& seq_lens_decoder,
const paddle::Tensor& batch_id_per_token,
const paddle::Tensor& cu_seqlens_q,
const paddle::Tensor& padding_offsets,
const paddle::Tensor& cum_offsets,
const paddle::Tensor& block_tables,
const paddle::Tensor& batch_ids,
const paddle::Tensor& tile_ids,
@@ -63,8 +63,7 @@ void EncoderWriteCacheWithRopeKernel(
qkv_out_scales ? qkv_out_scales.get().data<float>() : nullptr,
qkv_biases ? qkv_biases.get().data<T>() : nullptr,
rotary_embs.get().data<float>(),
batch_id_per_token.data<int>(),
cu_seqlens_q.data<int>(),
padding_offsets.data<int>(),
seq_lens_encoder.data<int>(),
seq_lens_decoder.data<int>(),
token_num,
@@ -83,8 +82,7 @@ void EncoderWriteCacheWithRopeKernel(
qkv_out_scales ? qkv_out_scales.get().data<float>() : nullptr,
qkv_biases ? qkv_biases.get().data<T>() : nullptr,
rotary_embs.get().data<float>(),
batch_id_per_token.data<int>(),
cu_seqlens_q.data<int>(),
padding_offsets.data<int>(),
seq_lens_encoder.data<int>(),
seq_lens_decoder.data<int>(),
token_num,
@@ -105,8 +103,7 @@ void EncoderWriteCacheWithRopeKernel(
cache_k_scale ? cache_k_scale.get().data<T>() : nullptr,
cache_v_scale ? cache_v_scale.get().data<T>() : nullptr,
rotary_embs.get().data<float>(),
batch_id_per_token.data<int>(),
cu_seqlens_q.data<int>(),
padding_offsets.data<int>(),
seq_lens_encoder.data<int>(),
seq_lens_decoder.data<int>(),
token_num,
@@ -126,8 +123,7 @@ void EncoderWriteCacheWithRopeKernel(
CascadeAppendWriteCacheKVQKV<T>(meta_data,
*qkv_out,
block_tables,
batch_id_per_token,
cu_seqlens_q,
padding_offsets,
seq_lens_encoder,
seq_lens_decoder,
max_seq_len,
@@ -146,8 +142,8 @@ void EncoderWriteCacheWithRopeKernel(
cache_v_scale.get(),
seq_lens_this_time,
seq_lens_decoder,
batch_id_per_token,
cu_seqlens_q,
padding_offsets,
cum_offsets,
block_tables,
batch_ids,
tile_ids,
@@ -173,8 +169,8 @@ void EncoderWriteCacheWithRopeKernel(
cache_v_zp.get(),
seq_lens_this_time,
seq_lens_decoder,
batch_id_per_token,
cu_seqlens_q,
padding_offsets,
cum_offsets,
block_tables,
batch_ids,
tile_ids,

View File

@@ -194,26 +194,23 @@ get_max_len_kv_ernel(int *max_seq_lens_out, const int *seq_lens_this_time,
std::vector<paddle::Tensor> GetBlockShapeAndSplitKVBlock(
const paddle::Tensor &seq_lens_encoder,
const paddle::Tensor &seq_lens_decoder,
const paddle::Tensor &seq_lens_this_time,
paddle::Tensor &decoder_batch_ids, // Inplace
paddle::Tensor &decoder_tile_ids_per_batch, // Inplace
paddle::Tensor &decoder_num_blocks_x_cpu, // Inplace, Pinned Memory
paddle::Tensor &max_len_tensor_cpu, // Inplace, Pinned Memory
const int encoder_block_shape_q,
const int decoder_block_shape_q,
const int group_size,
const int block_size,
const int decoder_step_token_num)
{
const paddle::Tensor &seq_lens_this_time, const paddle::Tensor &cum_offsets,
const int encoder_block_shape_q, const int decoder_block_shape_q,
const int group_size, const int block_size,
const int decoder_step_token_num) {
auto stream = seq_lens_encoder.stream();
int bsz = seq_lens_this_time.shape()[0];
paddle::Tensor max_len_tensor_gpu = GetEmptyTensor({max_len_tensor_cpu.shape()[0]}, paddle::DataType::INT32, seq_lens_this_time.place());
int bsz = cum_offsets.shape()[0];
auto max_len_tensor =
GetEmptyTensor({8}, paddle::DataType::INT32, seq_lens_encoder.place());
GetMaxLen(seq_lens_decoder, seq_lens_this_time, seq_lens_encoder,
max_len_tensor_gpu, bsz);
max_len_tensor_cpu.copy_(max_len_tensor_gpu, max_len_tensor_cpu.place(), false);
max_len_tensor, bsz);
auto max_len_cpu_ptr = max_len_tensor_cpu.data<int>();
// max_len_this_time, max_enc_len_this_time, max_dec_len_this_time,
// max_enc_dec_len_this_time, max_just_dec_len_this_time,
// max_just_dec_merged_len_this_time, max_system_len,
// max_just_dec_len_without_system
auto max_len_cpu = max_len_tensor.copy_to(paddle::CPUPlace(), false);
auto max_len_cpu_ptr = max_len_cpu.data<int>();
int max_len_this_time = max_len_cpu_ptr[0];
int max_enc_len_this_time = max_len_cpu_ptr[1];
int max_dec_len_this_time = max_len_cpu_ptr[2];
@@ -225,11 +222,14 @@ std::vector<paddle::Tensor> GetBlockShapeAndSplitKVBlock(
paddle::Tensor encoder_batch_ids;
paddle::Tensor encoder_tile_ids_per_batch;
paddle::Tensor encoder_num_blocks_x_cpu; /*cpu*/
paddle::Tensor encoder_num_blocks_x_cpu; /*cpu*/
paddle::Tensor kv_batch_ids;
paddle::Tensor kv_tile_ids_per_batch;
paddle::Tensor kv_num_blocks_x_cpu; /*cpu*/
paddle::Tensor max_len_kv_cpu; /*cpu*/
paddle::Tensor kv_num_blocks_x_cpu; /*cpu*/
paddle::Tensor decoder_batch_ids;
paddle::Tensor decoder_tile_ids_per_batch;
paddle::Tensor decoder_num_blocks_x_cpu; /*cpu*/
paddle::Tensor max_len_kv_cpu; /*cpu*/
auto max_len_kv =
GetEmptyTensor({1}, paddle::DataType::INT32, seq_lens_decoder.place());
@@ -291,64 +291,95 @@ std::vector<paddle::Tensor> GetBlockShapeAndSplitKVBlock(
kv_num_blocks_x_cpu =
GetEmptyTensor({0}, paddle::DataType::INT32, seq_lens_encoder.place());
}
if (max_just_dec_len_this_time > 0) {
// Clear buffer
const uint32_t decoder_max_tile_size_per_bs_q = div_up((decoder_step_token_num * group_size), decoder_block_shape_q);
const uint32_t decoder_batch_shape = bsz * decoder_max_tile_size_per_bs_q;
PADDLE_ENFORCE_GPU_SUCCESS(cudaMemsetAsync(decoder_batch_ids.data<int>(), 0, decoder_batch_shape * sizeof(int32_t), stream));
PADDLE_ENFORCE_GPU_SUCCESS(cudaMemsetAsync(decoder_tile_ids_per_batch.data<int>(), 0, decoder_batch_shape * sizeof(int32_t), stream));
PADDLE_ENFORCE_GPU_SUCCESS(cudaMemsetAsync(decoder_num_blocks_x_cpu.data<int>(), 0, sizeof(int32_t), stream));
const uint32_t decoder_max_tile_size_per_bs_q =
div_up((decoder_step_token_num * group_size), decoder_block_shape_q);
decoder_batch_ids =
GetEmptyTensor({bsz * decoder_max_tile_size_per_bs_q},
paddle::DataType::INT32, seq_lens_encoder.place());
decoder_tile_ids_per_batch =
GetEmptyTensor({bsz * decoder_max_tile_size_per_bs_q},
paddle::DataType::INT32, seq_lens_encoder.place());
auto decoder_num_blocks_x =
GetEmptyTensor({1}, paddle::DataType::INT32, seq_lens_encoder.place());
split_q_block<<<1, 32, 0, stream>>>(
seq_lens_this_time.data<int>(),
seq_lens_encoder.data<int>(),
decoder_batch_ids.data<int>(),
decoder_tile_ids_per_batch.data<int>(),
decoder_num_blocks_x.data<int>(),
bsz,
decoder_block_shape_q,
seq_lens_this_time.data<int>(), seq_lens_encoder.data<int>(),
decoder_batch_ids.data<int>(), decoder_tile_ids_per_batch.data<int>(),
decoder_num_blocks_x.data<int>(), bsz, decoder_block_shape_q,
group_size);
decoder_num_blocks_x_cpu.copy_(decoder_num_blocks_x, decoder_num_blocks_x_cpu.place(), false);
decoder_num_blocks_x_cpu =
decoder_num_blocks_x.copy_to(paddle::CPUPlace(), false);
} else {
decoder_batch_ids =
GetEmptyTensor({0}, paddle::DataType::INT32, seq_lens_encoder.place());
decoder_tile_ids_per_batch =
GetEmptyTensor({0}, paddle::DataType::INT32, seq_lens_encoder.place());
decoder_num_blocks_x_cpu =
GetEmptyTensor({0}, paddle::DataType::INT32, paddle::CPUPlace());
}
return {encoder_batch_ids,
encoder_tile_ids_per_batch,
encoder_num_blocks_x_cpu, /*cpu*/
kv_batch_ids,
kv_tile_ids_per_batch,
kv_num_blocks_x_cpu, /*cpu*/
decoder_batch_ids,
decoder_tile_ids_per_batch,
decoder_num_blocks_x_cpu, /*cpu*/
max_len_kv_cpu /*cpu*/,
max_len_cpu};
}
std::vector<paddle::DataType> GetBlockShapeAndSplitKVBlockInferDtype(
const paddle::DataType &seq_lens_encoder_dtype,
const paddle::DataType &seq_lens_decoder_dtype,
const paddle::DataType &seq_lens_this_time_dtype,
const paddle::DataType &cum_offsets_dtype) {
return {
encoder_batch_ids,
encoder_tile_ids_per_batch,
encoder_num_blocks_x_cpu, /*cpu*/
kv_batch_ids,
kv_tile_ids_per_batch,
kv_num_blocks_x_cpu, /*cpu*/
max_len_kv_cpu, /*cpu*/
};
paddle::DataType::INT32, paddle::DataType::INT32, paddle::DataType::INT32,
paddle::DataType::INT32, paddle::DataType::INT32, paddle::DataType::INT32,
paddle::DataType::INT32, paddle::DataType::INT32, paddle::DataType::INT32,
paddle::DataType::INT32, paddle::DataType::INT32};
}
std::vector<std::vector<int64_t>> GetBlockShapeAndSplitKVBlockInferShape(
const std::vector<int64_t> &seq_lens_encoder_shape,
const std::vector<int64_t> &seq_lens_decoder_shape,
const std::vector<int64_t> &seq_lens_this_time_shape,
const std::vector<int64_t> &cum_offsets_shape) {
std::vector<int64_t> dynamic_shape = {-1};
return {dynamic_shape,
dynamic_shape,
{1},
dynamic_shape,
dynamic_shape,
{1},
dynamic_shape,
dynamic_shape,
{1},
{1},
{8}};
}
PD_BUILD_STATIC_OP(get_block_shape_and_split_kv_block)
.Inputs({
"seq_lens_encoder",
"seq_lens_decoder",
"seq_lens_this_time",
"decoder_batch_ids",
"decoder_tile_ids_per_batch",
"decoder_num_blocks_x_cpu",
"max_len_tensor_cpu"
})
.Outputs({
paddle::Optional("encoder_batch_ids"),
paddle::Optional("encoder_tile_ids_per_batch"),
paddle::Optional("encoder_num_blocks_x_cpu"),
paddle::Optional("kv_batch_ids"),
paddle::Optional("kv_tile_ids_per_batch"),
paddle::Optional("kv_num_blocks_x_cpu"),
"max_len_kv_cpu"
})
.Attrs({
"encoder_block_shape_q: int",
"decoder_block_shape_q: int",
"group_size: int",
"block_size: int",
"decoder_step_token_num: int"
})
.SetKernelFn(PD_KERNEL(GetBlockShapeAndSplitKVBlock));
.Inputs({"seq_lens_encoder", "seq_lens_decoder", "seq_lens_this_time",
"cum_offsets"})
.Outputs({paddle::Optional("encoder_batch_ids"),
paddle::Optional("encoder_tile_ids_per_batch"),
paddle::Optional("encoder_num_blocks"),
paddle::Optional("kv_batch_ids"),
paddle::Optional("kv_tile_ids_per_batch"),
paddle::Optional("kv_num_blocks"),
paddle::Optional("decoder_batch_ids"),
paddle::Optional("decoder_tile_ids_per_batch"),
paddle::Optional("decoder_num_blocks"),
paddle::Optional("max_len_kv"), "set_max_lengths"})
.Attrs({"encoder_block_shape_q: int", "decoder_block_shape_q: int",
"group_size: int", "block_size: int",
"decoder_step_token_num: int"})
.SetKernelFn(PD_KERNEL(GetBlockShapeAndSplitKVBlock))
.SetInferShapeFn(PD_INFER_SHAPE(GetBlockShapeAndSplitKVBlockInferShape))
.SetInferDtypeFn(PD_INFER_DTYPE(GetBlockShapeAndSplitKVBlockInferDtype));
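The in-place variant of get_block_shape_and_split_kv_block shown above expects the caller to hand in decoder_batch_ids, decoder_tile_ids_per_batch, decoder_num_blocks_x_cpu and max_len_tensor_cpu already allocated; the cudaMemsetAsync lines imply the minimum sizes. A small C++ sketch of that sizing arithmetic, with illustrative numbers only:

#include <cstdio>

// Same rounding-up division used before the memset calls.
constexpr int div_up(int a, int b) { return (a + b - 1) / b; }

int main() {
  // Assumed illustrative values, not taken from any particular config.
  const int bsz = 32;
  const int decoder_step_token_num = 1;  // tokens generated per decode step
  const int group_size = 8;              // q heads per kv head (GQA group)
  const int decoder_block_shape_q = 16;

  const int decoder_max_tile_size_per_bs_q =
      div_up(decoder_step_token_num * group_size, decoder_block_shape_q);
  const int decoder_batch_shape = bsz * decoder_max_tile_size_per_bs_q;

  // decoder_batch_ids and decoder_tile_ids_per_batch must each hold
  // decoder_batch_shape int32 elements; decoder_num_blocks_x_cpu holds one int32.
  std::printf("tiles per batch = %d, decoder buffer elements = %d\n",
              decoder_max_tile_size_per_bs_q, decoder_batch_shape);
  return 0;
}

max_len_tensor_cpu is the pinned buffer that receives the eight max-length statistics listed in the comment above (max_len_this_time, max_enc_len_this_time, and so on), so a length-8 int32 tensor appears to be the expected shape.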

View File

@@ -16,6 +16,7 @@
#include "paddle/extension.h"
#include "paddle/phi/core/memory/memcpy.h"
#include "encoder_write_cache_with_rope_impl.cuh"
#include "paddle/phi/kernels/gpu/flash_attn_v3_kernel.h"
#include "paddle/phi/backends/context_pool.h"
#include "remote_cache_kv_ipc.h"
@@ -24,8 +25,7 @@ __global__ void GQAVariableLengthRotarySplitKernel(
const T *qkv,
const float *cos_emb,
const float *sin_emb,
const int *batch_id_per_token,
const int *cu_seqlens_q,
const int *padding_offsets,
const int *seq_lens,
const int *seq_lens_decoder,
const int *cu_seqlens_k,
@@ -52,13 +52,14 @@ __global__ void GQAVariableLengthRotarySplitKernel(
linear_index < elem_cnt;
linear_index += step) {
const int token_idx = linear_index / offset;
const int ori_bi = batch_id_per_token[token_idx];
const int ori_token_idx = token_idx + padding_offsets[token_idx];
const int ori_bi = ori_token_idx / seq_len;
if (seq_lens[ori_bi] == 0) continue;
const int bias = linear_index % offset;
const int hi = bias / last_dim;
const int h_bias = bias % last_dim;
const int ori_seq_id = (token_idx - cu_seqlens_q[ori_bi]) + seq_lens_decoder[ori_bi];
const int ori_seq_id = ori_token_idx % seq_len + seq_lens_decoder[ori_bi];
const int kv_write_idx = cu_seqlens_k[ori_bi] + ori_seq_id;
const int64_t emb_idx = ori_seq_id * half_lastdim + h_bias / 2;
@@ -107,10 +108,9 @@ void gqa_rotary_qk_split_variable(
T *v,
const T *qkv_input,
const float *rotary_emb, // [2, 1, 1, seq_len, dim_head / 2]
const int *batch_id_per_token,
const int *padding_offsets,
const int *seq_lens_encoder,
const int *seq_lens_decoder,
const int *cu_seqlens_q,
const int *cu_seqlens_k,
const int token_num,
const int num_heads,
@@ -133,8 +133,7 @@ void gqa_rotary_qk_split_variable(
qkv_input,
cos_emb,
sin_emb,
batch_id_per_token,
cu_seqlens_q,
padding_offsets,
seq_lens_encoder,
seq_lens_decoder,
cu_seqlens_k,
@@ -149,188 +148,13 @@ void gqa_rotary_qk_split_variable(
dim_head);
}
template <typename T,
typename CacheT,
uint32_t HEAD_DIM,
uint32_t BLOCK_SIZE,
uint32_t NUM_WARPS=4>
__global__ void append_cache_kv_c16(
const T *__restrict__ cache_k,
const T *__restrict__ cache_v,
T *__restrict__ k_out,
T *__restrict__ v_out,
const int *__restrict__ seq_lens_this_time,
const int *__restrict__ seq_lens_decoder,
const int *__restrict__ cu_seqlens_k,
const int *__restrict__ block_tables,
const int *batch_ids,
const int *tile_ids_per_batch,
const int max_blocks_per_seq,
const int kv_num_heads) {
// start_kv_idx: starting kv_idx of the current block
// batch_id: the batch this block belongs to
// TODO: 1.scale preload 2.frag_dq_T reuse 3.pipeline 4.store aligned 5.cacheT with template (int8/fp8)
const uint32_t tile_idx = blockIdx.x, kv_head_idx = blockIdx.z;
const uint32_t tid = threadIdx.x, wid = threadIdx.y;
const uint32_t batch_id = batch_ids[tile_idx];
const uint32_t start_kv_idx = tile_ids_per_batch[tile_idx] * BLOCK_SIZE;
const uint32_t end_idx = seq_lens_decoder[batch_id] - start_kv_idx;
if (seq_lens_this_time[batch_id] <= 0) {
return;
}
const int *cur_block_table = block_tables + batch_id * max_blocks_per_seq;
uint32_t block_id = cur_block_table[start_kv_idx / BLOCK_SIZE];
// cache_kv idx
uint32_t kv_h_stride = BLOCK_SIZE * HEAD_DIM;
uint32_t block_stride = kv_num_heads * kv_h_stride;
const CacheT *cur_cache_k = cache_k + block_id * block_stride + kv_head_idx * kv_h_stride;
const CacheT *cur_cache_v = cache_v + block_id * block_stride + kv_head_idx * kv_h_stride;
// k_out v_out idx
uint32_t kv_t_stride = kv_num_heads * HEAD_DIM;
T *k_write_ptr = k_out + (cu_seqlens_k[batch_id] + start_kv_idx) * kv_t_stride;
T *v_write_ptr = v_out + (cu_seqlens_k[batch_id] + start_kv_idx) * kv_t_stride;
uint32_t kv_frag[4];
T *frag_dq_T = reinterpret_cast<T *>(kv_frag);
constexpr uint32_t num_vecs_per_head =
HEAD_DIM / num_elems_per_128b<CacheT>();
constexpr uint32_t inv_kv_stride = 8 / num_vecs_per_head;
extern __shared__ uint8_t smem[];
smem_t k_smem(smem);
uint32_t k_smem_offset_w = smem_t::get_permuted_offset<num_vecs_per_head, inv_kv_stride>(
wid * 4 + tid / 8, tid % 8); // 4 * 4 per warp
uint32_t k_smem_offset_r = smem_t::get_permuted_offset<num_vecs_per_head, inv_kv_stride>(
wid * 16 + 8 * (tid / 16) + tid % 8, (tid % 16) / 8);
uint32_t k_read_idx = (wid * 4 + tid / 8) * HEAD_DIM +
tid % 8 * num_elems_per_128b<CacheT>();
// load k_smem 64 rows 128 cols
for (int fz = 0; fz < 4; fz++) { // 4 rows per warp once, 16 rows all 4 warps once, need 4 iter
for (int fy = 0; fy < 2; fy++) { // 8 * 128b = 64 * bf16 once, need 2 iter
k_smem.load_128b_async<SharedMemFillMode::kNoFill>(
k_smem_offset_w, cur_cache_k + k_read_idx, end_idx > 0);
k_smem_offset_w =
k_smem.advance_offset_by_column<8, num_vecs_per_head>(k_smem_offset_w, fy);
k_read_idx += 8 * num_elems_per_128b<CacheT>();
}
k_smem_offset_w =
k_smem.advance_offset_by_row<4 * NUM_WARPS, num_vecs_per_head>(k_smem_offset_w) - 16;
k_read_idx += 4 * NUM_WARPS * HEAD_DIM - 16 * num_elems_per_128b<CacheT>();
}
commit_group();
wait_group<0>();
__syncthreads();
// deal k_smem 64 rows 128 cols
for (int fz = 0; fz < 1; fz++) { // 16 rows per warp once, 64 rows all 4 warps once, need 1 iter
uint32_t row_idx = wid * 16 + tid / 4;
for (int fy = 0; fy < 8; fy++) { // 2 * 128b = 16 * bf16 once, need 8 iter
uint32_t col_idx = fy * 16 + tid % 4 * 2;
k_smem.ldmatrix_m8n8x4(k_smem_offset_r, kv_frag);
// layout
/***
r0c0,r0c1, r0c8,r0c9
r8c0,r8c1, r8c8,r8c9
***/
T *k_tile_ptr0 = k_write_ptr + row_idx * kv_t_stride + kv_head_idx * HEAD_DIM + col_idx;
T *k_tile_ptr1 = k_tile_ptr0 + 8 * kv_t_stride;
if (row_idx < end_idx) {
k_tile_ptr0[0] = frag_dq_T[0];
k_tile_ptr0[1] = frag_dq_T[1];
k_tile_ptr0[8] = frag_dq_T[2];
k_tile_ptr0[9] = frag_dq_T[3];
}
if (row_idx + 8 < end_idx) {
k_tile_ptr1[0] = frag_dq_T[4];
k_tile_ptr1[1] = frag_dq_T[5];
k_tile_ptr1[8] = frag_dq_T[6];
k_tile_ptr1[9] = frag_dq_T[7];
}
k_smem_offset_r = k_smem.advance_offset_by_column<2, num_vecs_per_head>(
k_smem_offset_r, fy);
}
k_smem_offset_r =
k_smem.advance_offset_by_row<16 * NUM_WARPS, num_vecs_per_head>(k_smem_offset_r) - 16;
}
// ================v================
smem_t v_smem(smem + BLOCK_SIZE * HEAD_DIM * sizeof(CacheT));
uint32_t v_smem_offset_w = smem_t::get_permuted_offset<num_vecs_per_head, inv_kv_stride>(
wid * 4 + tid / 8, tid % 8); // 4 * 4 per warp
uint32_t v_smem_offset_r = smem_t::get_permuted_offset<num_vecs_per_head, inv_kv_stride>(
wid * 16 + 8 * (tid / 16) + tid % 8, (tid % 16) / 8);
uint32_t v_read_idx = (wid * 4 + tid / 8) * HEAD_DIM +
tid % 8 * num_elems_per_128b<CacheT>();
// load v_smem 64 rows 128 cols
for (int fz = 0; fz < 4; fz++) { // 4 rows per warp once, 16 rows all 4 warps once, need 4 iter
for (int fy = 0; fy < 2; fy++) { // 8 * 128b = 64 * bf16 once, need 2 iter
v_smem.load_128b_async<SharedMemFillMode::kNoFill>(
v_smem_offset_w, cur_cache_v + v_read_idx, end_idx > 0);
v_smem_offset_w =
v_smem.advance_offset_by_column<8, num_vecs_per_head>(v_smem_offset_w, fy);
v_read_idx += 8 * num_elems_per_128b<CacheT>();
}
v_smem_offset_w =
v_smem.advance_offset_by_row<4 * NUM_WARPS, num_vecs_per_head>(v_smem_offset_w) - 16;
v_read_idx += 4 * NUM_WARPS * HEAD_DIM - 16 * num_elems_per_128b<CacheT>();
}
commit_group();
wait_group<0>();
__syncthreads();
// deal v_smem 64 rows 128 cols
for (int fz = 0; fz < 1; fz++) { // 16 rows per warp once, 64 rows all 4 warps once, need 1 iter
uint32_t row_idx = wid * 16 + tid / 4;
for (int fy = 0; fy < 8; fy++) { // 2 * 128b = 16 * bf16 once, need 8 iter
uint32_t col_idx = fy * 16 + tid % 4 * 2;
v_smem.ldmatrix_m8n8x4(v_smem_offset_r, kv_frag);
// layout
/***
r0c0,r0c1, r0c8,r0c9
r8c0,r8c1, r8c8,r8c9
***/
T *v_tile_ptr0 = v_write_ptr + row_idx * kv_t_stride + kv_head_idx * HEAD_DIM + col_idx;
T *v_tile_ptr1 = v_tile_ptr0 + 8 * kv_t_stride;
if (row_idx < end_idx) {
v_tile_ptr0[0] = frag_dq_T[0];
v_tile_ptr0[1] = frag_dq_T[1];
v_tile_ptr0[8] = frag_dq_T[2];
v_tile_ptr0[9] = frag_dq_T[3];
}
if (row_idx + 8 < end_idx) {
v_tile_ptr1[0] = frag_dq_T[4];
v_tile_ptr1[1] = frag_dq_T[5];
v_tile_ptr1[8] = frag_dq_T[6];
v_tile_ptr1[9] = frag_dq_T[7];
}
v_smem_offset_r = v_smem.advance_offset_by_column<2, num_vecs_per_head>(
v_smem_offset_r, fy);
}
v_smem_offset_r =
v_smem.advance_offset_by_row<16 * NUM_WARPS, num_vecs_per_head>(v_smem_offset_r) - 16;
}
}
template <typename T,
typename CacheT,
uint32_t HEAD_DIM,
uint32_t BLOCK_SIZE,
uint32_t NUM_WARPS=4,
bool IS_FP8=false>
__global__ void append_cache_kv_c8(
__global__ void append_dequant_cache_kv_c8(
const CacheT *__restrict__ cache_k,
const CacheT *__restrict__ cache_v,
T *__restrict__ k_out,
@@ -345,16 +169,16 @@ __global__ void append_cache_kv_c8(
const int *tile_ids_per_batch,
const int max_blocks_per_seq,
const int kv_num_heads) {
// start_kv_idx: starting kv_idx of the current block
// batch_id: the batch this block belongs to
// TODO: 1.scale preload 2.frag_dq_T reuse 3.pipeline 4.store aligned 5.cacheT with template (int8/fp8)
// start_kv_idx: the starting kv_idx of each block
// batch_id: the batch each block belongs to
// TODO: 1.scale prefetch 2.frag_dq_T reuse 3.pipeline scheduling 4.coalesced stores 5.cacheT support (int8/fp8)
const uint32_t tile_idx = blockIdx.x, kv_head_idx = blockIdx.z;
const uint32_t tid = threadIdx.x, wid = threadIdx.y;
const uint32_t batch_id = batch_ids[tile_idx];
const uint32_t start_kv_idx = tile_ids_per_batch[tile_idx] * BLOCK_SIZE;
const uint32_t end_idx = seq_lens_decoder[batch_id] - start_kv_idx;
if (seq_lens_this_time[batch_id] <= 0) {
if (seq_lens_this_time <= 0) {
return;
}
@@ -368,8 +192,8 @@ __global__ void append_cache_kv_c8(
// k_out v_out idx
uint32_t kv_t_stride = kv_num_heads * HEAD_DIM;
T *k_write_ptr = k_out + (cu_seqlens_k[batch_id] + start_kv_idx) * kv_t_stride;
T *v_write_ptr = v_out + (cu_seqlens_k[batch_id] + start_kv_idx) * kv_t_stride;
T *k_write_ptr = k_out + (cu_seqlens_k[batch_id] + start_kv_idx) * kv_t_stride; // start pointer of the current k block
T *v_write_ptr = v_out + (cu_seqlens_k[batch_id] + start_kv_idx) * kv_t_stride; // start pointer of the current v block
uint32_t k_frag[4], v_frag[4], frag_dq[4];
T *frag_dq_T = reinterpret_cast<T *>(frag_dq);
@@ -390,13 +214,13 @@ __global__ void append_cache_kv_c8(
uint32_t k_smem_offset_r = smem_t::get_permuted_offset<num_vecs_per_head_k, inv_k_stride>(
wid * 16 + 8 * (tid / 16) + tid % 8, (tid % 16) / 8);
uint32_t k_read_idx = (wid * 4 + tid / 8) * HEAD_DIM +
tid % 8 * num_elems_per_128b<CacheT>();
// load k_smem 64 rows, 128 cols
for (int fz = 0; fz < 4; fz++) { // 4 rows per warp once, 16 rows all 4 warps once, need 4 iter
for (int fy = 0; fy < 1; fy++) { // 8 * 128b = 128 * uint8 once, need 1 iter
// load k_smem: 64 rows, 128 cols
for (int fz = 0; fz < 4; fz++) { // 4 rows per warp per iter, 16 rows over 4 iters, 64 rows across 4 warps
for (int fy = 0; fy < 1; fy++) { // 8 * 128b = 128 uint8 per iter
k_smem.load_128b_async<SharedMemFillMode::kNoFill>(
k_smem_offset_w, cur_cache_k + k_read_idx, end_idx > 0);
k_smem_offset_w =
@@ -411,13 +235,13 @@ __global__ void append_cache_kv_c8(
wait_group<0>();
__syncthreads();
// deal k_smem 64 rows, 128 cols
for (int fz = 0; fz < 1; fz++) { // 16 rows per warp once, 64 rows all 4 warps once, need 1 iter
// deal k_smem: 64 rows, 128 cols
for (int fz = 0; fz < 1; fz++) { // 16 rows per warp per iter, 64 rows across 4 warps
uint32_t row_idx = wid * 16 + tid / 4;
for (int fy = 0; fy < 4; fy++) { // 2 * 128b = 32 * uint8 once, need 4 iter
for (int fy = 0; fy < 4; fy++) { // 2 x 128b (32 uint8) per iter, 4 iters cover 8 x 128b (128 uint8)
uint32_t col_idx = fy * 32 + tid % 4 * 2;
k_smem.ldmatrix_m8n8x4(k_smem_offset_r, k_frag);
// layout
// dequantize and store
/***
r0c0,r0c1,r0c8,r0c9, r8c0,r8c1,r8c8,r8c9
r0c16,r0c17,r0c24,r0c25, r8c16,r8c17,r8c24,r8c25
@@ -427,7 +251,8 @@ __global__ void append_cache_kv_c8(
T *k_tile_ptr1 = k_tile_ptr0 + 8 * kv_t_stride;
if (row_idx < end_idx) {
convert_c8<T,IS_FP8>(frag_dq_T,k_frag[2 * i]); // 4 * uint8/fp8 -> 4 * T
convert_c8<T,IS_FP8>(frag_dq_T,k_frag[2 * i]); // 4 x uint8/fp8 -> 4 x T
k_tile_ptr0[0] = frag_dq_T[0] * cache_k_scale;
k_tile_ptr0[1] = frag_dq_T[1] * cache_k_scale;
k_tile_ptr0[8] = frag_dq_T[2] * cache_k_scale;
@@ -435,7 +260,8 @@ __global__ void append_cache_kv_c8(
}
if (row_idx + 8 < end_idx) {
convert_c8<T,IS_FP8>(frag_dq_T + 4,k_frag[2 * i + 1]); // 4 * uint8/fp8 -> 4 * T
convert_c8<T,IS_FP8>(frag_dq_T + 4,k_frag[2 * i + 1]); // 4 x uint8/fp8 -> 4 x T
k_tile_ptr1[0] = frag_dq_T[4] * cache_k_scale;
k_tile_ptr1[1] = frag_dq_T[5] * cache_k_scale;
k_tile_ptr1[8] = frag_dq_T[6] * cache_k_scale;
@@ -449,8 +275,8 @@ __global__ void append_cache_kv_c8(
k_smem_offset_r =
k_smem.advance_offset_by_row<16 * NUM_WARPS, num_vecs_per_head_k>(k_smem_offset_r) - 8;
}
// ================v================
smem_t v_smem(smem + BLOCK_SIZE * HEAD_DIM * sizeof(CacheT));
uint32_t v_smem_offset_w = smem_t::get_permuted_offset<num_vecs_per_blocksize, inv_v_stride>(
wid * 8 + tid / 4, tid % 4); // 4 * 8 per warp
@@ -460,9 +286,9 @@ __global__ void append_cache_kv_c8(
uint32_t v_read_idx = (wid * 8 + tid / 4) * BLOCK_SIZE +
tid % 4 * num_elems_per_128b<CacheT>();
// load v_smem 128 rows 64 cols
for (int fy = 0; fy < 4; fy++) { // 8 rows per warp once, 32 rows all 4 warps once, need 4 iter
for (int fz = 0; fz < 1; fz++) { // 4 * 128b = 64 * uint8 once, need 1 iter
// load v_smem: 128 rows, 64 cols
for (int fy = 0; fy < 4; fy++) { // 8 rows per warp per iter, 32 rows over 4 iters, 128 rows across 4 warps
for (int fz = 0; fz < 1; fz++) { // 4 x 128b = 64 uint8 per iter
v_smem.load_128b_async<SharedMemFillMode::kNoFill>(
v_smem_offset_w, cur_cache_v + v_read_idx, end_idx > 0);
v_smem_offset_w =
@@ -478,32 +304,42 @@ __global__ void append_cache_kv_c8(
wait_group<0>();
__syncthreads();
// deal v_smem 128 rows 64 cols
for (int fy = 0; fy < 2; fy++) { // 16 rows per warp once, 64 rows all 4 warps once, need 2 iter
// deal v_smem: 128 rows, 64 cols; row_idx is head_dim, col_idx is block_size
for (int fy = 0; fy < 2; fy++) { // 16 rows per warp per iter, 32 rows over 2 iters, 128 rows across 4 warps
uint32_t dim_idx = fy * NUM_WARPS * 16 + wid * 16 + tid / 4;
for (int fz = 0; fz < 2; fz++) { // 2 * 128b = 32 * uint8 once, need 2 iter
for (int fz = 0; fz < 2; fz++) { // 2 x 128b (32 uint8) per iter, 2 iters cover 4 x 128b (64 uint8)
uint32_t kv_idx = fz * 32 + tid % 4 * 2;
v_smem.ldmatrix_m8n8x4(v_smem_offset_r, v_frag);
// layout
// dequantize and store
for (int i = 0; i < 4 / 2; i++) {
T *v_tile_ptr0 = v_write_ptr + kv_idx * kv_t_stride + kv_head_idx * HEAD_DIM + dim_idx;
T *v_tile_ptr1 = v_tile_ptr0 + 8;
convert_c8<T,IS_FP8>(frag_dq_T, v_frag[2 * i]); // 4 * uint8/fp8 -> 4 * T
convert_c8<T,IS_FP8>(frag_dq_T + 4, v_frag[2 * i + 1]); // 4 * uint8/fp8 -> 4 * T
if (kv_idx < end_idx) {
convert_c8<T,IS_FP8>(frag_dq_T, v_frag[2 * i]); // 4 x uint8/fp8 -> 4 x T
#ifdef C8_DEBUG
if (tid == 0 && wid == 0 && tile_idx == 0 && kv_head_idx == 0) {
printf("1.fy: %d, fz:%d, row_idx: %d, col_idx: %d, v_frag: %.f, %.f, %.f, %.f \n",
fy, fz, kv_idx, dim_idx, static_cast<float>(frag_dq_T[0]), static_cast<float>(frag_dq_T[1]),
static_cast<float>(frag_dq_T[2]), static_cast<float>(frag_dq_T[3]));
}
#endif
v_tile_ptr0[0] = frag_dq_T[0] * cache_v_scale;
v_tile_ptr1[0] = frag_dq_T[4] * cache_v_scale;
}
if (kv_idx + 1 < end_idx) {
v_tile_ptr0[kv_t_stride] = frag_dq_T[1] * cache_v_scale;
v_tile_ptr1[kv_t_stride] = frag_dq_T[5] * cache_v_scale;
}
if (kv_idx + 8 < end_idx) {
v_tile_ptr0[8 * kv_t_stride] = frag_dq_T[2] * cache_v_scale;
v_tile_ptr1[8 * kv_t_stride] = frag_dq_T[6] * cache_v_scale;
}
if (kv_idx + 9 < end_idx) {
v_tile_ptr0[9 * kv_t_stride] = frag_dq_T[3] * cache_v_scale;
convert_c8<T,IS_FP8>(frag_dq_T + 4, v_frag[2 * i + 1]); // 4 x uint8/fp8 -> 4 x T
#ifdef C8_DEBUG
if (tid == 0 && wid == 0 && tile_idx == 0 && kv_head_idx == 0) {
printf("2.fy: %d, fz:%d, row_idx: %d, col_idx: %d, v_frag: %.f, %.f, %.f, %.f \n",
fy, fz, kv_idx, dim_idx + 8, static_cast<float>(frag_dq_T[4]), static_cast<float>(frag_dq_T[5]),
static_cast<float>(frag_dq_T[6]), static_cast<float>(frag_dq_T[7]));
}
#endif
v_tile_ptr1[0] = frag_dq_T[4] * cache_v_scale;
v_tile_ptr1[kv_t_stride] = frag_dq_T[5] * cache_v_scale;
v_tile_ptr1[8 * kv_t_stride] = frag_dq_T[6] * cache_v_scale;
v_tile_ptr1[9 * kv_t_stride] = frag_dq_T[7] * cache_v_scale;
}
kv_idx += 16;
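The c8 path above reconstructs bf16/fp16 values from the 8-bit cache by converting each packed fragment and multiplying by the per-head scale (frag_dq_T[i] * cache_k_scale or cache_v_scale). The following minimal CPU sketch mimics only the int8 case with a symmetric scale; the stored encoding and the concrete values are assumptions for illustration, and the real convert_c8 also handles fp8 on the GPU.

#include <cstdint>
#include <cstdio>

int main() {
  // Assumed symmetric per-head dequantization scale (illustrative value).
  const float cache_k_scale = 0.05f;
  // Four cached values, treated as plain int8 for the sketch.
  const int8_t packed[4] = {37, -12, 127, -128};

  for (int i = 0; i < 4; ++i) {
    // Same shape of computation as the kernel: dequantized = frag_dq_T[i] * cache_k_scale.
    const float dequant = static_cast<float>(packed[i]) * cache_k_scale;
    std::printf("%d -> %f\n", packed[i], dequant);
  }
  return 0;
}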
@@ -516,250 +352,12 @@ __global__ void append_cache_kv_c8(
}
}
template <typename T,
typename CacheT,
uint32_t HEAD_DIM,
uint32_t BLOCK_SIZE,
uint32_t NUM_WARPS=4>
__global__ void append_cache_kv_c4(
const CacheT *__restrict__ cache_k,
const CacheT *__restrict__ cache_v,
T *__restrict__ k_out,
T *__restrict__ v_out,
const T *__restrict__ cache_k_dequant_scales,
const T *__restrict__ cache_v_dequant_scales,
const T *__restrict__ cache_k_zero_point,
const T *__restrict__ cache_v_zero_point,
const int *__restrict__ seq_lens_this_time,
const int *__restrict__ seq_lens_decoder,
const int *__restrict__ cu_seqlens_k,
const int *__restrict__ block_tables,
const int *batch_ids,
const int *tile_ids_per_batch,
const int max_blocks_per_seq,
const int kv_num_heads) {
// start_kv_idx: starting kv_idx of the current block
// batch_id: the batch this block belongs to
// TODO: 1.scale preload 2.frag_dq_T reuse 3.pipeline 4.store aligned 5.cacheT with template (int8/fp8)
const uint32_t tile_idx = blockIdx.x, kv_head_idx = blockIdx.z;
const uint32_t tid = threadIdx.x, wid = threadIdx.y;
const uint32_t batch_id = batch_ids[tile_idx];
const uint32_t start_kv_idx = tile_ids_per_batch[tile_idx] * BLOCK_SIZE;
const uint32_t end_idx = seq_lens_decoder[batch_id] - start_kv_idx;
if (seq_lens_this_time[batch_id] <= 0) {
return;
}
const int *cur_block_table = block_tables + batch_id * max_blocks_per_seq;
uint32_t block_id = cur_block_table[start_kv_idx / BLOCK_SIZE];
if (block_id < 0) block_id = 0;
constexpr uint32_t HEAD_DIM_HALF = HEAD_DIM / 2;
constexpr uint32_t BLOCK_SIZE_HALF = BLOCK_SIZE / 2;
// cache_kv idx
uint32_t kv_h_stride = BLOCK_SIZE * HEAD_DIM_HALF;
uint32_t block_stride = kv_num_heads * kv_h_stride;
const CacheT *cur_cache_k = cache_k + block_id * block_stride + kv_head_idx * kv_h_stride;
const CacheT *cur_cache_v = cache_v + block_id * block_stride + kv_head_idx * kv_h_stride;
// k_out v_out idx
uint32_t kv_t_stride = kv_num_heads * HEAD_DIM;
T *k_write_ptr = k_out + (cu_seqlens_k[batch_id] + start_kv_idx) * kv_t_stride;
T *v_write_ptr = v_out + (cu_seqlens_k[batch_id] + start_kv_idx) * kv_t_stride;
extern __shared__ uint8_t smem[];
uint32_t k_frag[4], v_frag[4], frag_dq[8];
T *frag_dq_T = reinterpret_cast<T *>(frag_dq);
// load dequant scales and zero points
const T *cache_k_scale_now = cache_k_dequant_scales + kv_head_idx * HEAD_DIM;
const T *cache_k_zp_now = cache_k_zero_point + kv_head_idx * HEAD_DIM;
const T *cache_v_scale_now = cache_v_dequant_scales + kv_head_idx * HEAD_DIM;
const T *cache_v_zp_now = cache_v_zero_point + kv_head_idx * HEAD_DIM;
T *cache_k_scale_smem = reinterpret_cast<T *>(
smem + BLOCK_SIZE * HEAD_DIM * sizeof(CacheT));
T *cache_k_zero_point_smem = cache_k_scale_smem + HEAD_DIM;
T *cache_v_scale_smem = cache_k_zero_point_smem + HEAD_DIM;
T *cache_v_zero_point_smem = cache_v_scale_smem + HEAD_DIM;
#pragma unroll
for (uint32_t i = wid * 32 + tid; i < HEAD_DIM; i += 128) {
cache_k_scale_smem[i] = cache_k_scale_now[i];
cache_k_zero_point_smem[i] = cache_k_zp_now[i] + static_cast<T>(136.f);
cache_v_scale_smem[i] = cache_v_scale_now[i];
cache_v_zero_point_smem[i] = cache_v_zp_now[i] + static_cast<T>(136.f);
}
smem_t k_smem(smem);
constexpr uint32_t num_vecs_per_head_k =
HEAD_DIM_HALF / num_elems_per_128b<CacheT>(); // 2
constexpr uint32_t num_vecs_per_blocksize =
BLOCK_SIZE_HALF / num_elems_per_128b<CacheT>();
constexpr uint32_t inv_k_stride = 8 / num_vecs_per_head_k; // 4
constexpr uint32_t inv_v_stride = 8 / num_vecs_per_blocksize;
uint32_t k_smem_offset_w = smem_t::get_permuted_offset<num_vecs_per_head_k, inv_k_stride>(
wid * 8 + tid / 4, tid % 4); // 2(iter) * 4(warp) * 8 row per warp
uint32_t k_smem_offset_r = smem_t::get_permuted_offset<num_vecs_per_head_k, inv_k_stride>(
wid * 16 + 8 * (tid / 16) + tid % 8, (tid % 16) / 8); //
uint32_t k_read_idx = (wid * 8 + tid / 4) * HEAD_DIM / 2 +
tid % 4 * num_elems_per_128b<CacheT>();
// load k_smem 64 rows 128 cols
for (int fz = 0; fz < 2; fz++) { // 8 rows per warp once, 32 rows all 4 warps once, need 2 iter
for (int fy = 0; fy < 1; fy++) { // 4 * 128b = 128 * int4 once, need 1 iter
k_smem.load_128b_async<SharedMemFillMode::kNoFill>(
k_smem_offset_w, cur_cache_k + k_read_idx, end_idx > 0);
k_smem_offset_w =
k_smem.advance_offset_by_column<4, num_vecs_per_head_k>(k_smem_offset_w, fy);
k_read_idx += 4 * num_elems_per_128b<CacheT>();
}
k_smem_offset_w =
k_smem.advance_offset_by_row<8 * NUM_WARPS, num_vecs_per_head_k>(k_smem_offset_w) - 4;
k_read_idx += 8 * NUM_WARPS * HEAD_DIM / 2 - 4 * num_elems_per_128b<CacheT>();
}
commit_group();
wait_group<0>();
__syncthreads();
// deal k_smem 64 rows 128 cols
for (int fz = 0; fz < 1; fz++) { // 16 rows per warp once, 64 rows all 4 warps once, need 1 iter
uint32_t row_idx = wid * 16 + tid / 4;
for (int fy = 0; fy < 2; fy++) { // 2 * 128b = 64 * int4 once, need 2 iter
uint32_t col_idx = fy * 64 + tid % 4 * 2;
k_smem.ldmatrix_m8n8x4(k_smem_offset_r, k_frag);
for (int i = 0; i < 2; i++) {
T *k_tile_ptr0 = k_write_ptr + row_idx * kv_t_stride + kv_head_idx * HEAD_DIM + col_idx;
T *k_tile_ptr1 = k_tile_ptr0 + 8 * kv_t_stride;
convert_int4(frag_dq_T, k_frag[2 * i]);
convert_int4(frag_dq_T + 8, k_frag[2 * i + 1]);
if (row_idx < end_idx) {
k_tile_ptr0[0] = (frag_dq_T[0] - cache_k_zero_point_smem[col_idx]) * cache_k_scale_smem[col_idx];
k_tile_ptr0[1] = (frag_dq_T[1] - cache_k_zero_point_smem[col_idx + 1]) * cache_k_scale_smem[col_idx + 1];
k_tile_ptr0[8] = (frag_dq_T[2] - cache_k_zero_point_smem[col_idx + 8]) * cache_k_scale_smem[col_idx + 8];
k_tile_ptr0[9] = (frag_dq_T[3] - cache_k_zero_point_smem[col_idx + 9]) * cache_k_scale_smem[col_idx + 9];
k_tile_ptr0[16] = (frag_dq_T[8] - cache_k_zero_point_smem[col_idx + 16]) * cache_k_scale_smem[col_idx + 16];
k_tile_ptr0[17] = (frag_dq_T[9] - cache_k_zero_point_smem[col_idx + 17]) * cache_k_scale_smem[col_idx + 17];
k_tile_ptr0[24] = (frag_dq_T[10] - cache_k_zero_point_smem[col_idx + 24]) * cache_k_scale_smem[col_idx + 24];
k_tile_ptr0[25] = (frag_dq_T[11] - cache_k_zero_point_smem[col_idx + 25]) * cache_k_scale_smem[col_idx + 25];
}
if (row_idx + 8 < end_idx) {
k_tile_ptr1[0] = (frag_dq_T[4] - cache_k_zero_point_smem[col_idx]) * cache_k_scale_smem[col_idx];
k_tile_ptr1[1] = (frag_dq_T[5] - cache_k_zero_point_smem[col_idx + 1]) * cache_k_scale_smem[col_idx + 1];
k_tile_ptr1[8] = (frag_dq_T[6] - cache_k_zero_point_smem[col_idx + 8]) * cache_k_scale_smem[col_idx + 8];
k_tile_ptr1[9] = (frag_dq_T[7] - cache_k_zero_point_smem[col_idx + 9]) * cache_k_scale_smem[col_idx + 9];
k_tile_ptr1[16] = (frag_dq_T[12] - cache_k_zero_point_smem[col_idx + 16]) * cache_k_scale_smem[col_idx + 16];
k_tile_ptr1[17] = (frag_dq_T[13] - cache_k_zero_point_smem[col_idx + 17]) * cache_k_scale_smem[col_idx + 17];
k_tile_ptr1[24] = (frag_dq_T[14] - cache_k_zero_point_smem[col_idx + 24]) * cache_k_scale_smem[col_idx + 24];
k_tile_ptr1[25] = (frag_dq_T[15] - cache_k_zero_point_smem[col_idx + 25]) * cache_k_scale_smem[col_idx + 25];
}
col_idx += 32;
}
k_smem_offset_r = k_smem.advance_offset_by_column<2, num_vecs_per_head_k>(
k_smem_offset_r, fy);
}
k_smem_offset_r =
k_smem.advance_offset_by_row<16 * NUM_WARPS, num_vecs_per_head_k>(k_smem_offset_r) - 4;
}
// ================v================
smem_t v_smem(smem + BLOCK_SIZE * HEAD_DIM * sizeof(CacheT) / 2);
uint32_t v_smem_offset_w = smem_t::get_permuted_offset<num_vecs_per_blocksize, inv_v_stride>(
wid * 16 + tid / 2, tid % 2); // 4 * 8 per warp
uint32_t v_smem_offset_r = smem_t::get_permuted_offset<num_vecs_per_blocksize, inv_v_stride>(
wid * 16 + 8 * (tid / 16) + tid % 8, (tid % 16) / 8);
uint32_t v_read_idx = (wid * 16 + tid / 2) * BLOCK_SIZE_HALF +
tid % 2 * num_elems_per_128b<CacheT>();
// load v_smem 128 rows 64 cols
for (int fy = 0; fy < 2; fy++) { // 16 rows per warp once, 64 rows all 4 warps once, need 2 iter
for (int fz = 0; fz < 1; fz++) { // 2 * 128b = 64 * int4 once, need 1 iter
v_smem.load_128b_async<SharedMemFillMode::kNoFill>(
v_smem_offset_w, cur_cache_v + v_read_idx, end_idx > 0);
v_smem_offset_w =
v_smem.advance_offset_by_column<2, num_vecs_per_blocksize>(v_smem_offset_w, fz);
v_read_idx += 2 * num_elems_per_128b<CacheT>();
}
v_smem_offset_w =
v_smem.advance_offset_by_row<16 * NUM_WARPS, num_vecs_per_blocksize>(v_smem_offset_w) - 2;
v_read_idx += 16 * NUM_WARPS * BLOCK_SIZE_HALF - 2 * num_elems_per_128b<CacheT>();
}
commit_group();
wait_group<0>();
__syncthreads();
// deal v_smem 128 rows 64 cols
for (int fy = 0; fy < 2; fy++) { // 16 rows per warp once, 64 rows all 4 warps once, need 2 iter
uint32_t dim_idx = fy * NUM_WARPS * 16 + wid * 16 + tid / 4;
for (int fz = 0; fz < 1; fz++) { // 2 * 128b = 64 * int4 once, need 1 iter
uint32_t kv_idx = fz * 64 + tid % 4 * 2;
v_smem.ldmatrix_m8n8x4(v_smem_offset_r, v_frag);
// layout
for (int i = 0; i < 2; i++) {
T *v_tile_ptr0 = v_write_ptr + kv_idx * kv_t_stride + kv_head_idx * HEAD_DIM + dim_idx;
T *v_tile_ptr1 = v_tile_ptr0 + 8;
convert_int4(frag_dq_T, v_frag[2 * i]);
convert_int4(frag_dq_T + 8, v_frag[2 * i + 1]);
if (kv_idx < end_idx) {
v_tile_ptr0[0] = (frag_dq_T[0] - cache_v_zero_point_smem[dim_idx]) * cache_v_scale_smem[dim_idx];
v_tile_ptr1[0] = (frag_dq_T[4] - cache_v_zero_point_smem[dim_idx + 8]) * cache_v_scale_smem[dim_idx + 8];
}
if (kv_idx + 1 < end_idx) {
v_tile_ptr0[kv_t_stride] = (frag_dq_T[1] - cache_v_zero_point_smem[dim_idx]) * cache_v_scale_smem[dim_idx];
v_tile_ptr1[kv_t_stride] = (frag_dq_T[5] - cache_v_zero_point_smem[dim_idx + 8]) * cache_v_scale_smem[dim_idx + 8];
}
if (kv_idx + 8 < end_idx) {
v_tile_ptr0[8 * kv_t_stride] = (frag_dq_T[2] - cache_v_zero_point_smem[dim_idx]) * cache_v_scale_smem[dim_idx];
v_tile_ptr1[8 * kv_t_stride] = (frag_dq_T[6] - cache_v_zero_point_smem[dim_idx + 8]) * cache_v_scale_smem[dim_idx + 8];
}
if (kv_idx + 9 < end_idx) {
v_tile_ptr0[9 * kv_t_stride] = (frag_dq_T[3] - cache_v_zero_point_smem[dim_idx]) * cache_v_scale_smem[dim_idx];
v_tile_ptr1[9 * kv_t_stride] = (frag_dq_T[7] - cache_v_zero_point_smem[dim_idx + 8]) * cache_v_scale_smem[dim_idx + 8];
}
if (kv_idx + 16 < end_idx) {
v_tile_ptr0[16 * kv_t_stride] = (frag_dq_T[8] - cache_v_zero_point_smem[dim_idx]) * cache_v_scale_smem[dim_idx];
v_tile_ptr1[16 * kv_t_stride] = (frag_dq_T[12] - cache_v_zero_point_smem[dim_idx + 8]) * cache_v_scale_smem[dim_idx + 8];
}
if (kv_idx + 17 < end_idx) {
v_tile_ptr0[17 * kv_t_stride] = (frag_dq_T[9] - cache_v_zero_point_smem[dim_idx]) * cache_v_scale_smem[dim_idx];
v_tile_ptr1[17 * kv_t_stride] = (frag_dq_T[13] - cache_v_zero_point_smem[dim_idx + 8]) * cache_v_scale_smem[dim_idx + 8];
}
if (kv_idx + 24 < end_idx) {
v_tile_ptr0[24 * kv_t_stride] = (frag_dq_T[10] - cache_v_zero_point_smem[dim_idx]) * cache_v_scale_smem[dim_idx];
v_tile_ptr1[24 * kv_t_stride] = (frag_dq_T[14] - cache_v_zero_point_smem[dim_idx + 8]) * cache_v_scale_smem[dim_idx + 8];
}
if (kv_idx + 25 < end_idx) {
v_tile_ptr0[25 * kv_t_stride] = (frag_dq_T[11] - cache_v_zero_point_smem[dim_idx]) * cache_v_scale_smem[dim_idx];
v_tile_ptr1[25 * kv_t_stride] = (frag_dq_T[15] - cache_v_zero_point_smem[dim_idx + 8]) * cache_v_scale_smem[dim_idx + 8];
}
kv_idx += 32;
}
v_smem_offset_r = v_smem.advance_offset_by_column<2, num_vecs_per_blocksize>(
v_smem_offset_r, fz);
}
v_smem_offset_r =
v_smem.advance_offset_by_row<16 * NUM_WARPS, num_vecs_per_blocksize>(v_smem_offset_r) - 2;
}
}
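For reference, the element-wise transform applied in both the K and V loops above is plain asymmetric dequantization, value = (x - zero_point) * scale, with per-channel zero points and scales staged in shared memory. A scalar sketch of the same math, without the ldmatrix fragment layout and warp tiling (hypothetical helper, not part of this diff):

template <typename T>
__device__ __forceinline__ T dequant_elem(T x, T zero_point, T scale) {
  // x is the quantized value already widened to T (e.g. by convert_int4)
  return (x - zero_point) * scale;
}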
template <typename T, uint32_t HEAD_DIM, uint32_t BLOCK_SIZE>
void AppendCacheKV(
void AppendDequantCache(
const paddle::Tensor &cache_k,
const paddle::Tensor &cache_v,
const paddle::Tensor &cache_k_dequant_scales,
const paddle::Tensor &cache_v_dequant_scales,
const paddle::Tensor &cache_k_zp,
const paddle::Tensor &cache_v_zp,
const paddle::Tensor &seq_lens_this_time,
const paddle::Tensor &seq_lens_decoder,
const paddle::Tensor &cu_seqlens_k,
@@ -773,41 +371,19 @@ void AppendCacheKV(
paddle::Tensor *k_out,
paddle::Tensor *v_out,
const cudaStream_t& stream
) {
) {
using NV_TYPE = typename cascade_attn_type_traits<T>::type;
constexpr int NUM_WARPS = 4;
int block_num = cache_num_blocks_x.data<int>()[0];
dim3 grids(block_num, 1, kv_num_heads);
dim3 blocks(32, NUM_WARPS);
if (cache_quant_type == "none") {
const uint32_t smem_size = BLOCK_SIZE * HEAD_DIM * sizeof(T) * 2;
auto kernel_func = append_cache_kv_c16<NV_TYPE, NV_TYPE, HEAD_DIM, BLOCK_SIZE, NUM_WARPS>;
if (smem_size >= 48 * 1024) {
cudaFuncSetAttribute(kernel_func,
cudaFuncAttributeMaxDynamicSharedMemorySize,
smem_size);
}
kernel_func<<<grids, blocks, smem_size, stream>>>(
reinterpret_cast<NV_TYPE *>(const_cast<T *>(cache_k.data<T>())),
reinterpret_cast<NV_TYPE *>(const_cast<T *>(cache_v.data<T>())),
reinterpret_cast<NV_TYPE *>(k_out->data<T>()),
reinterpret_cast<NV_TYPE *>(v_out->data<T>()),
seq_lens_this_time.data<int>(),
seq_lens_decoder.data<int>(),
cu_seqlens_k.data<int>(),
block_tables.data<int>(),
cache_batch_ids.data<int>(),
cache_tile_ids_per_batch.data<int>(),
max_blocks_per_seq,
kv_num_heads
);
} else if (cache_quant_type == "cache_int8" || cache_quant_type == "cache_fp8") {
if (cache_quant_type == "cache_int8" || cache_quant_type == "cache_fp8") {
constexpr int NUM_WARPS = 4;
int block_num = cache_num_blocks_x.data<int>()[0];
dim3 grids(block_num, 1, kv_num_heads);
dim3 blocks(32, NUM_WARPS);
const uint32_t smem_size = BLOCK_SIZE * HEAD_DIM * sizeof(uint8_t) * 2;
auto kernel_func = append_cache_kv_c8<NV_TYPE, uint8_t, HEAD_DIM, BLOCK_SIZE, NUM_WARPS, false>;
auto kernel_func = append_dequant_cache_kv_c8<NV_TYPE, uint8_t, HEAD_DIM, BLOCK_SIZE, NUM_WARPS, false>;
if (cache_quant_type == "cache_fp8") {
kernel_func = append_cache_kv_c8<NV_TYPE, uint8_t, HEAD_DIM, BLOCK_SIZE, NUM_WARPS, true>;
kernel_func = append_dequant_cache_kv_c8<NV_TYPE, uint8_t, HEAD_DIM, BLOCK_SIZE, NUM_WARPS, true>;
}
if (smem_size >= 48 * 1024) {
cudaFuncSetAttribute(kernel_func,
@@ -830,34 +406,6 @@ void AppendCacheKV(
max_blocks_per_seq,
kv_num_heads
);
} else if (cache_quant_type == "cache_int4_zp") {
const uint32_t smem_size = BLOCK_SIZE * HEAD_DIM * sizeof(uint8_t) + 4 * HEAD_DIM * sizeof(T);
auto kernel_func = append_cache_kv_c4<NV_TYPE, uint8_t, HEAD_DIM, BLOCK_SIZE, NUM_WARPS>;
if (smem_size >= 48 * 1024) {
cudaFuncSetAttribute(kernel_func,
cudaFuncAttributeMaxDynamicSharedMemorySize,
smem_size);
}
kernel_func<<<grids, blocks, smem_size, stream>>>(
cache_k.data<uint8_t>(),
cache_v.data<uint8_t>(),
reinterpret_cast<NV_TYPE *>(k_out->data<T>()),
reinterpret_cast<NV_TYPE *>(v_out->data<T>()),
reinterpret_cast<NV_TYPE *>(const_cast<T *>(cache_k_dequant_scales.data<T>())),
reinterpret_cast<NV_TYPE *>(const_cast<T *>(cache_v_dequant_scales.data<T>())),
reinterpret_cast<NV_TYPE *>(const_cast<T *>(cache_k_zp.data<T>())),
reinterpret_cast<NV_TYPE *>(const_cast<T *>(cache_v_zp.data<T>())),
seq_lens_this_time.data<int>(),
seq_lens_decoder.data<int>(),
cu_seqlens_k.data<int>(),
block_tables.data<int>(),
cache_batch_ids.data<int>(),
cache_tile_ids_per_batch.data<int>(),
max_blocks_per_seq,
kv_num_heads
);
} else {
PADDLE_THROW("%s mode isn't implemented yet", cache_quant_type.c_str());
}
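The launch pattern repeated in each branch above is the standard opt-in for dynamic shared memory beyond the default 48 KB per-block limit: raise the attribute first, then pass the size as the third launch parameter. A minimal standalone sketch, with an illustrative kernel name and size rather than the ones used here:

__global__ void demo_kernel(float *out) {
  extern __shared__ float smem[];
  smem[threadIdx.x] = static_cast<float>(threadIdx.x);
  out[threadIdx.x] = smem[threadIdx.x];
}

void launch_demo(float *out, cudaStream_t stream) {
  const uint32_t smem_size = 64 * 1024;  // above 48 KB, so opt in first
  if (smem_size >= 48 * 1024) {
    cudaFuncSetAttribute(demo_kernel,
                         cudaFuncAttributeMaxDynamicSharedMemorySize,
                         smem_size);
  }
  demo_kernel<<<1, 256, smem_size, stream>>>(out);
}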
@@ -873,7 +421,8 @@ std::vector<paddle::Tensor> GQARopeWriteCacheKernel(
const paddle::Tensor& seq_lens_this_time,
const paddle::Tensor& seq_lens_encoder,
const paddle::Tensor& seq_lens_decoder,
const paddle::Tensor& batch_id_per_token,
const paddle::Tensor& padding_offsets,
const paddle::Tensor& cum_offsets,
const paddle::Tensor& block_tables,
const paddle::Tensor& kv_batch_ids,
const paddle::Tensor& kv_tile_ids,
@@ -901,9 +450,9 @@ std::vector<paddle::Tensor> GQARopeWriteCacheKernel(
const int token_num = qkv_dims[0];
const int max_blocks_per_seq = block_tables.dims()[1];
const int block_size = key_cache.dims()[2];
const int batch_size = seq_lens_this_time.dims()[0];
const int batch_size = cum_offsets.dims()[0];
const int kv_num_heads = key_cache_dims[1];
const int head_dim = cache_quant_type == "cache_int4_zp" ? key_cache_dims[3] * 2 : key_cache_dims[3];
const int head_dim = key_cache_dims[3];
const int num_heads = qkv_dims[qkv_dims.size() - 1] / head_dim - 2 * kv_num_heads;
const float softmax_scale = 1.f / sqrt(head_dim);
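For example, with head_dim = 128 the softmax_scale above is 1.f / sqrt(128) ≈ 0.0884. The cache_int4_zp form multiplies key_cache_dims[3] by 2 because two int4 values are packed into each stored byte.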
@@ -914,7 +463,7 @@ std::vector<paddle::Tensor> GQARopeWriteCacheKernel(
meta_data.q_num_heads = num_heads;
meta_data.max_blocks_per_seq = max_blocks_per_seq;
meta_data.block_size = block_size;
meta_data.batch_size = seq_lens_this_time.dims()[0];
meta_data.batch_size = cum_offsets.dims()[0];
phi::GPUContext* dev_ctx = static_cast<phi::GPUContext*>(phi::DeviceContextPool::Instance().Get(qkv.place()));
@@ -944,10 +493,9 @@ std::vector<paddle::Tensor> GQARopeWriteCacheKernel(
v.data<data_t>(),
qkv.data<data_t>(),
rotary_embs.data<float>(),
batch_id_per_token.data<int>(),
padding_offsets.data<int>(),
seq_lens_encoder.data<int>(),
seq_lens_decoder.data<int>(),
cu_seqlens_q.data<int>(),
cu_seqlens_k.data<int>(),
token_num,
num_heads,
@@ -956,38 +504,13 @@ std::vector<paddle::Tensor> GQARopeWriteCacheKernel(
rotary_embs.dims()[2],
head_dim,
stream);
if (token_num < kv_token_num) {
AppendCacheKV<data_t, 128, 64>(
key_cache,
value_cache,
cache_k_dequant_scales.get(),
cache_v_dequant_scales.get(),
cache_k_zp.get(),
cache_v_zp.get(),
seq_lens_this_time,
seq_lens_decoder,
cu_seqlens_k,
block_tables,
cache_batch_ids,
cache_tile_ids,
cache_num_blocks,
max_blocks_per_seq,
kv_num_heads,
cache_quant_type,
&k,
&v,
stream
);
}
// write cache
if (cache_quant_type == "none") {
CascadeAppendWriteCacheKVQKV<data_t>(
meta_data,
qkv_out,
block_tables,
batch_id_per_token,
cu_seqlens_q,
padding_offsets,
seq_lens_encoder,
seq_lens_decoder,
max_seq_len,
@@ -1004,8 +527,8 @@ std::vector<paddle::Tensor> GQARopeWriteCacheKernel(
cache_v_quant_scales.get(),
seq_lens_this_time,
seq_lens_decoder,
batch_id_per_token,
cu_seqlens_q,
padding_offsets,
cum_offsets,
block_tables,
kv_batch_ids,
kv_tile_ids,
@@ -1016,32 +539,6 @@ std::vector<paddle::Tensor> GQARopeWriteCacheKernel(
stream,
const_cast<paddle::Tensor*>(&key_cache),
const_cast<paddle::Tensor*>(&value_cache));
} else if (cache_quant_type == "cache_int4_zp") {
CascadeAppendWriteCacheKVC4QKV<data_t, 128, 64>(
meta_data,
*const_cast<paddle::Tensor*>(&key_cache),
*const_cast<paddle::Tensor*>(&value_cache),
qkv_out,
cache_k_quant_scales.get(),
cache_v_quant_scales.get(),
cache_k_zp.get(),
cache_v_zp.get(),
seq_lens_this_time,
seq_lens_decoder,
batch_id_per_token,
cu_seqlens_q,
block_tables,
kv_batch_ids,
kv_tile_ids,
kv_num_blocks_data,
max_seq_len,
stream,
const_cast<paddle::Tensor*>(&key_cache),
const_cast<paddle::Tensor*>(&value_cache));
} else {
PD_THROW(
"cache_quant_type_str should be one of [none, cache_int8, cache_fp8, "
"cache_int4_zp]");
}
const char* fmt_write_cache_completed_signal_str = std::getenv("FLAGS_fmt_write_cache_completed_signal");
const char* FLAGS_use_pd_disaggregation_per_chunk = std::getenv("FLAGS_use_pd_disaggregation_per_chunk");
@@ -1062,6 +559,28 @@ std::vector<paddle::Tensor> GQARopeWriteCacheKernel(
}
}
}
if (token_num < kv_token_num) {
AppendDequantCache<data_t, 128, 64>(
key_cache,
value_cache,
cache_k_dequant_scales.get(),
cache_v_dequant_scales.get(),
seq_lens_this_time,
seq_lens_decoder,
cu_seqlens_k,
block_tables,
cache_batch_ids,
cache_tile_ids,
cache_num_blocks,
max_blocks_per_seq,
kv_num_heads,
cache_quant_type,
&k,
&v,
stream
);
}
return {q, k, v, qkv_out};
}
@@ -1075,7 +594,8 @@ PD_BUILD_STATIC_OP(gqa_rope_write_cache)
"seq_lens_this_time",
"seq_lens_encoder",
"seq_lens_decoder",
"batch_id_per_token",
"padding_offsets",
"cum_offsets",
"block_tables",
"kv_batch_ids",
"kv_tile_ids_per_batch",

View File

@@ -13,7 +13,6 @@
// limitations under the License.
#pragma once
#include "helper.h"
#include "mla_cache_kernel.cuh"
template <paddle::DataType T>
@@ -23,8 +22,8 @@ std::vector<paddle::Tensor> PrefillMLAWriteCache(
const paddle::Tensor& kv_pe,
const paddle::Tensor& seq_lens,
const paddle::Tensor& seq_lens_decoder,
const paddle::Tensor& batch_id_per_token,
const paddle::Tensor& cu_seqlens_q,
const paddle::Tensor& padding_offsets,
const paddle::Tensor& cum_offsets,
const paddle::Tensor& block_tables,
const int max_seq_len,
cudaStream_t& stream,
@@ -54,8 +53,8 @@ std::vector<paddle::Tensor> PrefillMLAWriteCache(
reinterpret_cast<DataType_*>(const_cast<data_t*>(kv_pe.data<data_t>())),
reinterpret_cast<DataType_*>(kv_cache->data<data_t>()),
block_tables.data<int>(),
batch_id_per_token.data<int>(),
cu_seqlens_q.data<int>(),
padding_offsets.data<int>(),
cum_offsets.data<int>(),
seq_lens.data<int>(),
seq_lens_decoder.data<int>(),
max_seq_len,
@@ -74,8 +73,8 @@ std::vector<paddle::Tensor> PrefillMLAWriteCacheKernel(
const paddle::Tensor& kv_cache,
const paddle::Tensor& seq_lens,
const paddle::Tensor& seq_lens_decoder,
const paddle::Tensor& batch_id_per_token,
const paddle::Tensor& cu_seqlens_q,
const paddle::Tensor& padding_offsets,
const paddle::Tensor& cum_offsets,
const paddle::Tensor& block_tables,
const std::string& cache_quant_type_str,
const int max_seq_len) {
@@ -92,7 +91,7 @@ std::vector<paddle::Tensor> PrefillMLAWriteCacheKernel(
meta_data.max_blocks_per_seq = block_tables.dims()[1];
meta_data.block_size = kv_cache_dims[2];
meta_data.batch_size = seq_lens_decoder.dims()[0];
meta_data.batch_size = cum_offsets.dims()[0];
switch (kv_pe.dtype()) {
case paddle::DataType::BFLOAT16: {
return PrefillMLAWriteCache<paddle::DataType::BFLOAT16>(meta_data,
@@ -100,8 +99,8 @@ std::vector<paddle::Tensor> PrefillMLAWriteCacheKernel(
kv_pe,
seq_lens,
seq_lens_decoder,
batch_id_per_token,
cu_seqlens_q,
padding_offsets,
cum_offsets,
block_tables,
max_seq_len,
stream,
@@ -113,8 +112,8 @@ std::vector<paddle::Tensor> PrefillMLAWriteCacheKernel(
kv_pe,
seq_lens,
seq_lens_decoder,
batch_id_per_token,
cu_seqlens_q,
padding_offsets,
cum_offsets,
block_tables,
max_seq_len,
stream,
@@ -131,8 +130,8 @@ std::vector<paddle::Tensor> DecodeMLAWriteCache(
const paddle::Tensor& kv_pe,
const paddle::Tensor& seq_lens,
const paddle::Tensor& seq_lens_encoder,
const paddle::Tensor& batch_id_per_token,
const paddle::Tensor& cu_seqlens_q,
const paddle::Tensor& padding_offsets,
const paddle::Tensor& cum_offsets,
const paddle::Tensor& block_tables,
const int max_seq_len,
const bool speculate_decoder,
@@ -165,8 +164,8 @@ std::vector<paddle::Tensor> DecodeMLAWriteCache(
reinterpret_cast<DataType_*>(const_cast<data_t*>(kv_pe.data<data_t>())),
reinterpret_cast<DataType_*>(kv_cache->data<data_t>()),
block_tables.data<int>(),
batch_id_per_token.data<int>(),
cu_seqlens_q.data<int>(),
padding_offsets.data<int>(),
cum_offsets.data<int>(),
seq_lens.data<int>(),
seq_lens_encoder.data<int>(),
max_seq_len,
@@ -186,7 +185,7 @@ std::vector<paddle::Tensor> DecodeMLAWriteCache(
reinterpret_cast<DataType_*>(const_cast<data_t*>(kv_pe.data<data_t>())),
reinterpret_cast<DataType_*>(kv_cache->data<data_t>()),
block_tables.data<int>(),
cu_seqlens_q.data<int>(),
cum_offsets.data<int>(),
seq_lens.data<int>(),
seq_lens_encoder.data<int>(),
max_seq_len,
@@ -206,8 +205,8 @@ std::vector<paddle::Tensor> DecodeMLAWriteCacheKernel(
const paddle::Tensor& kv_cache,
const paddle::Tensor& seq_lens,
const paddle::Tensor& seq_lens_encoder,
const paddle::Tensor& batch_id_per_token,
const paddle::Tensor& cu_seqlens_q,
const paddle::Tensor& padding_offsets,
const paddle::Tensor& cum_offsets,
const paddle::Tensor& block_tables,
const std::string& cache_quant_type_str,
const int max_seq_len,
@@ -225,7 +224,7 @@ std::vector<paddle::Tensor> DecodeMLAWriteCacheKernel(
meta_data.max_blocks_per_seq = block_tables.dims()[1];
meta_data.block_size = kv_cache_dims[2];
meta_data.batch_size = seq_lens_encoder.dims()[0];
meta_data.batch_size = cum_offsets.dims()[0];
switch (kv_pe.dtype()) {
case paddle::DataType::BFLOAT16: {
return DecodeMLAWriteCache<paddle::DataType::BFLOAT16>(meta_data,
@@ -233,8 +232,8 @@ std::vector<paddle::Tensor> DecodeMLAWriteCacheKernel(
kv_pe,
seq_lens,
seq_lens_encoder,
batch_id_per_token,
cu_seqlens_q,
padding_offsets,
cum_offsets,
block_tables,
max_seq_len,
speculate_decoder,
@@ -247,8 +246,8 @@ std::vector<paddle::Tensor> DecodeMLAWriteCacheKernel(
kv_pe,
seq_lens,
seq_lens_encoder,
batch_id_per_token,
cu_seqlens_q,
padding_offsets,
cum_offsets,
block_tables,
max_seq_len,
speculate_decoder,
@@ -260,14 +259,14 @@ std::vector<paddle::Tensor> DecodeMLAWriteCacheKernel(
}
PD_BUILD_STATIC_OP(prefill_mla_write_cache)
PD_BUILD_OP(prefill_mla_write_cache)
.Inputs({"kv_nope",
"kv_pe",
"kv_cache",
"seq_lens",
"seq_lens_decoder",
"batch_id_per_token",
"cu_seqlens_q",
"padding_offsets",
"cum_offsets",
"block_tables"})
.Outputs({"kv_cache_out"})
.SetInplaceMap({{"kv_cache", "kv_cache_out"}})
@@ -275,14 +274,14 @@ PD_BUILD_STATIC_OP(prefill_mla_write_cache)
"max_seq_len: int"})
.SetKernelFn(PD_KERNEL(PrefillMLAWriteCacheKernel));
PD_BUILD_STATIC_OP(decode_mla_write_cache)
PD_BUILD_OP(decode_mla_write_cache)
.Inputs({"kv_nope",
"kv_pe",
"kv_cache",
"seq_lens",
"seq_lens_encoder",
"batch_id_per_token",
"cu_seqlens_q",
"padding_offsets",
"cum_offsets",
"block_tables"})
.Outputs({"kv_cache_out"})
.SetInplaceMap({{"kv_cache", "kv_cache_out"}})

View File

@@ -24,7 +24,7 @@ __global__ void decode_absorb_cache_kernel(
T* __restrict__ kv_cache, // [num_blocks, kv_num_heads, block_size,
// nope_size]
const int* __restrict__ block_tables, // [bsz, max_blocks_per_seq]
const int* __restrict__ cu_seqlens_q,
const int* __restrict__ cum_offsets,
const int* __restrict__ seq_lens, // [bsz]
const int* __restrict__ seq_lens_encoder, // [bsz]
const int max_seq_len,
@@ -50,7 +50,7 @@ __global__ void decode_absorb_cache_kernel(
linear_index += step) {
const int ori_bi = linear_index / hidden_size;
const int bias = linear_index % hidden_size;
const int start_token_idx = cu_seqlens_q[ori_bi];
const int start_token_idx = ori_bi * max_seq_len - cum_offsets[ori_bi];
if (seq_lens_encoder[ori_bi] > 0) return;
const int write_seq_id = seq_lens[ori_bi];
@@ -95,8 +95,8 @@ __global__ void speculate_decode_absorb_cache_kernel(
T* __restrict__ kv_cache, // [num_blocks, kv_num_heads, block_size,
// nope_size]
const int* __restrict__ block_tables, // [bsz, max_blocks_per_seq]
const int* __restrict__ batch_id_per_token,
const int* __restrict__ cu_seqlens_q,
const int* __restrict__ padding_offsets,
const int* __restrict__ cum_offsets,
const int* __restrict__ seq_lens, // [bsz]
const int* __restrict__ seq_lens_encoder, // [bsz]
const int max_seq_len,
@@ -121,10 +121,10 @@ __global__ void speculate_decode_absorb_cache_kernel(
linear_index < elem_cnt;
linear_index += step) {
const int token_id = linear_index / hidden_size;
const int ori_bi = batch_id_per_token[token_id];
const int ori_bi = (token_id + padding_offsets[token_id]) / max_seq_len;
if (seq_lens[ori_bi] == 0) continue;
const int bias = linear_index % hidden_size;
const int start_token_idx = cu_seqlens_q[ori_bi];
const int start_token_idx = ori_bi * max_seq_len - cum_offsets[ori_bi];
const int write_seq_id =
seq_lens[ori_bi] + token_id - start_token_idx;
if (write_seq_id == 0) continue;
@@ -143,7 +143,7 @@ __global__ void speculate_decode_absorb_cache_kernel(
ori_bi,
seq_lens[ori_bi],
token_id,
cu_seqlens_q[ori_bi]);
cum_offsets[ori_bi]);
}
if (bias < nope_hidden_size) { // pe
const uint32_t inner_bias = bias;
@@ -178,8 +178,8 @@ __global__ void prefill_absorb_cache_kernel(
T* __restrict__ kv_cache, // [num_blocks, kv_num_heads, block_size,
// nope_size]
const int* __restrict__ block_tables, // [bsz, max_blocks_per_seq]
const int* __restrict__ batch_id_per_token,
const int* __restrict__ cu_seqlens_q,
const int* __restrict__ padding_offsets,
const int* __restrict__ cum_offsets,
const int* __restrict__ seq_lens, // [bsz]
const int* __restrict__ seq_lens_decoder, // [bsz]
const int max_seq_len,
@@ -204,9 +204,11 @@ __global__ void prefill_absorb_cache_kernel(
linear_index += step) {
const uint32_t token_idx = linear_index / hidden_size;
const uint32_t bias = linear_index % hidden_size;
const uint32_t ori_bi = batch_id_per_token[token_idx];
const uint32_t ori_token_idx = token_idx + padding_offsets[token_idx];
const uint32_t ori_bi = ori_token_idx / max_seq_len;
if (seq_lens[ori_bi] == 0) continue;
const uint32_t ori_seq_id = (token_idx - cu_seqlens_q[ori_bi]) + seq_lens_decoder[ori_bi];
const uint32_t ori_seq_id =
ori_token_idx % max_seq_len + seq_lens_decoder[ori_bi];
const int* block_table_now = nullptr;
block_table_now = block_tables + ori_bi * max_blocks_per_seq;
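The hunks in this file switch between two ways of locating a token's batch and the packed index of that batch's first token. Assuming cum_offsets[b] holds the total padding removed before batch b in a max_seq_len-slots-per-batch layout (so that cu_seqlens_q[b] == b * max_seq_len - cum_offsets[b]), the two forms of start_token_idx coincide; a small host-side check with illustrative values:

#include <cassert>
#include <vector>

int main() {
  const int max_seq_len = 8;
  std::vector<int> seq_lens   = {3, 5, 2};       // hypothetical per-batch query lengths
  std::vector<int> cu_seqlens = {0, 3, 8, 10};   // prefix sums of seq_lens
  std::vector<int> cum_offsets(seq_lens.size());
  int pad = 0;
  for (size_t b = 0; b < seq_lens.size(); ++b) {
    cum_offsets[b] = pad;                        // padding accumulated before batch b
    pad += max_seq_len - seq_lens[b];
  }
  for (int b = 0; b < static_cast<int>(seq_lens.size()); ++b) {
    // cu_seqlens_q[b] and b * max_seq_len - cum_offsets[b] agree under this layout
    assert(cu_seqlens[b] == b * max_seq_len - cum_offsets[b]);
  }
  // likewise, batch_id_per_token[i] caches (i + padding_offsets[i]) / max_seq_len
  return 0;
}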

View File

@@ -26,8 +26,8 @@ void DecodeMLAAttentionKernel(
const paddle::optional<paddle::Tensor>& smooth_weight,
const paddle::Tensor &seq_lens_q, // q_seq_len is 1
const paddle::Tensor &seq_lens_kv,
const paddle::Tensor &batch_id_per_token,
const paddle::Tensor &cu_seqlens_q,
const paddle::Tensor &padding_offsets,
const paddle::Tensor &cum_offsets,
const paddle::Tensor &block_table,
int max_seq_len,
int max_dec_len,

View File

@@ -26,8 +26,8 @@ __global__ void append_clear_cache_int8_block(
// block_size, head_size // 2]
const int* __restrict__ seq_lens,
const int* __restrict__ block_tables, // [bsz, max_blocks_per_seq]
const int* __restrict__ batch_id_per_token, // [num_tokens]
const int* __restrict__ cu_seqlens_q,
const int* __restrict__ padding_offsets, // [num_tokens]
const int* __restrict__ cum_offsets,
const int* __restrict__ seq_lens_encoder, // [bsz]
const int max_seq_len,
const int max_blocks_per_seq,
@@ -41,10 +41,10 @@ __global__ void append_clear_cache_int8_block(
const int wid = tid / 32;
const int lane_id = tid % 32;
const int token_id = blockIdx.x;
const int ori_token_id = token_id + padding_offsets[token_id];
const int bid = ori_token_id / max_seq_len;
const int bid = batch_id_per_token[token_id];
const int start_token_idx = cu_seqlens_q[bid];
const int start_token_idx = bid * max_seq_len - cum_offsets[bid];
const int head_idx = blockIdx.y * NUM_WARPS + wid;
if (seq_lens_encoder[bid] > 0) return;
@@ -100,8 +100,8 @@ __global__ void append_clear_cache_int4_block(
// block_size, head_size // 2]
const int* __restrict__ seq_lens,
const int* __restrict__ block_tables, // [bsz, max_blocks_per_seq]
const int* __restrict__ batch_id_per_token, // [num_tokens]
const int* __restrict__ cu_seqlens_q,
const int* __restrict__ padding_offsets, // [num_tokens]
const int* __restrict__ cum_offsets,
const int* __restrict__ seq_lens_encoder, // [bsz]
const int max_seq_len,
const int max_blocks_per_seq,
@@ -115,10 +115,10 @@ __global__ void append_clear_cache_int4_block(
const int wid = tid / 32;
const int lane_id = tid % 32;
const int token_id = blockIdx.x;
const int ori_token_id = token_id + padding_offsets[token_id];
const int bid = ori_token_id / max_seq_len;
const int bid = batch_id_per_token[token_id];
const int start_token_idx = cu_seqlens_q[bid];
const int start_token_idx = bid * max_seq_len - cum_offsets[bid];
const int head_idx = blockIdx.y * NUM_WARPS + wid;
if (seq_lens_encoder[bid] > 0) return;
@@ -178,8 +178,8 @@ __global__ void append_speculate_cache_rope_kernel(
// head_size // 2]
T* __restrict__ q_out,
const int* __restrict__ block_tables, // [bsz, max_blocks_per_seq]
const int* __restrict__ batch_id_per_token, // [num_tokens]
const int* __restrict__ cu_seqlens_q,
const int* __restrict__ padding_offsets, // [num_tokens]
const int* __restrict__ cum_offsets,
const int* __restrict__ seq_lens_decoder, // [bsz]
const float* __restrict__ cos_emb,
const float* __restrict__ sin_emb,
@@ -214,12 +214,12 @@ __global__ void append_speculate_cache_rope_kernel(
linear_index < elem_cnt;
linear_index += step) {
const int token_id = linear_index / hidden_size;
const int ori_bi = batch_id_per_token[token_id];
const int ori_bi = (token_id + padding_offsets[token_id]) / max_seq_len;
if (seq_lens_decoder[ori_bi] == 0) continue;
const int bias = linear_index % hidden_size;
const int hi = bias / head_size; // q + k + v
const int h_bias = bias % head_size;
const int start_token_idx = cu_seqlens_q[ori_bi];
const int start_token_idx = ori_bi * max_seq_len - cum_offsets[ori_bi];
const int write_seq_id =
seq_lens_decoder[ori_bi] + token_id - start_token_idx;
if (write_seq_id == 0) continue;
@@ -235,7 +235,7 @@ __global__ void append_speculate_cache_rope_kernel(
ori_bi,
seq_lens_decoder[ori_bi],
token_id,
cu_seqlens_q[ori_bi]);
cum_offsets[ori_bi]);
}
const int block_offset = write_seq_id % block_size;
@@ -311,8 +311,8 @@ __global__ void append_speculate_cache_neox_rope_kernel(
// head_size // 2]
T* __restrict__ qkv_out,
const int* __restrict__ block_tables, // [bsz, max_blocks_per_seq]
const int* __restrict__ batch_id_per_token, // [num_tokens]
const int* __restrict__ cu_seqlens_q,
const int* __restrict__ padding_offsets, // [num_tokens]
const int* __restrict__ cum_offsets,
const int* __restrict__ seq_lens_decoder, // [bsz]
const float* __restrict__ cos_emb,
const float* __restrict__ sin_emb,
@@ -347,12 +347,12 @@ __global__ void append_speculate_cache_neox_rope_kernel(
linear_index < elem_cnt;
linear_index += step) {
const int token_id = linear_index / half_hidden_size;
const int ori_bi = batch_id_per_token[token_id];
const int ori_bi = (token_id + padding_offsets[token_id]) / max_seq_len;
if (seq_lens_decoder[ori_bi] == 0) continue;
const int bias = linear_index % half_hidden_size;
const int hi = bias / half_head_size; // q + k + v
const int h_bias = bias % half_head_size;
const int start_token_idx = cu_seqlens_q[ori_bi];
const int start_token_idx = ori_bi * max_seq_len - cum_offsets[ori_bi];
const int write_seq_id =
seq_lens_decoder[ori_bi] + token_id - start_token_idx;
if (write_seq_id == 0) continue;
@@ -368,7 +368,7 @@ __global__ void append_speculate_cache_neox_rope_kernel(
ori_bi,
seq_lens_decoder[ori_bi],
token_id,
cu_seqlens_q[ori_bi]);
cum_offsets[ori_bi]);
}
const int block_offset = write_seq_id % block_size;
@@ -458,8 +458,8 @@ __global__ void append_speculate_cache_int8_rope_kernel(
// block_size, head_size // 2]
T* __restrict__ qkv_out,
const int* __restrict__ block_tables, // [bsz, max_blocks_per_seq]
const int* __restrict__ batch_id_per_token, // [num_tokens]
const int* __restrict__ cu_seqlens_q,
const int* __restrict__ padding_offsets, // [num_tokens]
const int* __restrict__ cum_offsets,
const int* __restrict__ seq_lens, // [bsz]
const int* __restrict__ seq_lens_encoder, // [bsz]
const float* __restrict__ cos_emb,
@@ -484,10 +484,10 @@ __global__ void append_speculate_cache_int8_rope_kernel(
const int wid = tid / 32;
const int lane_id = tid % 32;
const int token_id = blockIdx.x;
const int ori_token_id = token_id + padding_offsets[token_id];
const int bid = ori_token_id / max_seq_len;
const int bid = batch_id_per_token[token_id];
const int start_token_idx = cu_seqlens_q[bid];
const int start_token_idx = bid * max_seq_len - cum_offsets[bid];
const int head_idx = blockIdx.y * NUM_WARPS + wid;
int q_head_idx, k_head_idx, v_idx;
const int64_t hidden_size = (num_heads + 2 * gqa_group_size) * HeadDim;
@@ -690,8 +690,8 @@ __global__ void append_speculate_cache_int8_neox_rope_kernel(
// block_size, head_size // 2]
T* __restrict__ qkv_out,
const int* __restrict__ block_tables, // [bsz, max_blocks_per_seq]
const int* __restrict__ batch_id_per_token, // [num_tokens]
const int* __restrict__ cu_seqlens_q,
const int* __restrict__ padding_offsets, // [num_tokens]
const int* __restrict__ cum_offsets,
const int* __restrict__ seq_lens, // [bsz]
const int* __restrict__ seq_lens_encoder, // [bsz]
const float* __restrict__ cos_emb,
@@ -716,10 +716,10 @@ __global__ void append_speculate_cache_int8_neox_rope_kernel(
const int wid = tid / 32;
const int lane_id = tid % 32;
const int token_id = blockIdx.x;
const int ori_token_id = token_id + padding_offsets[token_id];
const int bid = ori_token_id / max_seq_len;
const int bid = batch_id_per_token[token_id];
const int start_token_idx = cu_seqlens_q[bid];
const int start_token_idx = bid * max_seq_len - cum_offsets[bid];
const int head_idx = blockIdx.y * NUM_WARPS + wid;
int q_head_idx, k_head_idx, v_idx;
@@ -1068,8 +1068,8 @@ __global__ void append_speculate_cache_int4_rope_kernel(
// block_size, head_size // 2]
T* __restrict__ qkv_out,
const int* __restrict__ block_tables, // [bsz, max_blocks_per_seq]
const int* __restrict__ batch_id_per_token, // [num_tokens]
const int* __restrict__ cu_seqlens_q,
const int* __restrict__ padding_offsets, // [num_tokens]
const int* __restrict__ cum_offsets,
const int* __restrict__ seq_lens, // [bsz]
const int* __restrict__ seq_lens_encoder, // [bsz]
const float* __restrict__ cos_emb,
@@ -1097,10 +1097,10 @@ __global__ void append_speculate_cache_int4_rope_kernel(
const int lane_id = tid % 32;
const int token_id = blockIdx.x;
const int ori_token_id = token_id + padding_offsets[token_id];
const int bid = ori_token_id / max_seq_len;
const int bid = batch_id_per_token[token_id];
const int start_token_idx = cu_seqlens_q[bid];
const int start_token_idx = bid * max_seq_len - cum_offsets[bid];
const int head_idx = blockIdx.y * NUM_WARPS + wid;
const int64_t hidden_size = (num_heads + 2 * gqa_group_size) * HeadDim;
@@ -1130,10 +1130,6 @@ __global__ void append_speculate_cache_int4_rope_kernel(
LoadOutScaleT out_scale_vec;
LoadEmbT cos_emb_vec;
LoadEmbT sin_emb_vec;
#pragma unroll
for (int v_i = 0; v_i < VecSize; v_i++) {
bias_vec[v_i] = 0;
}
const InT* qkv_now = quant_qkv + token_id * hidden_size;
T* qkv_out_now = qkv_out + token_id * hidden_size;
#pragma unroll
@@ -1141,8 +1137,8 @@ __global__ void append_speculate_cache_int4_rope_kernel(
head_bias += 32 * VecSize) {
const int bias_idx = head_idx * HeadDim + head_bias;
Load<InT, VecSize>(&qkv_now[bias_idx], &src_vec);
// Load<T, VecSize>(&qkv_biases[bias_idx], &bias_vec);
// Load<float, VecSize>(&qkv_out_scales[bias_idx], &out_scale_vec);
Load<T, VecSize>(&qkv_biases[bias_idx], &bias_vec);
Load<float, VecSize>(&qkv_out_scales[bias_idx], &out_scale_vec);
// q rope
const uint32_t emb_idx = write_seq_id * half_head_size + head_bias / 2;
Load<float, HalfVecSize>(&cos_emb[emb_idx], &cos_emb_vec);
@@ -1152,10 +1148,10 @@ __global__ void append_speculate_cache_int4_rope_kernel(
// dequant + add_bias + rope
float input_left = static_cast<float>(src_vec[2 * i]);
float input_right = static_cast<float>(src_vec[2 * i + 1]);
// input_left = input_left * out_scale_vec[2 * i] +
// static_cast<float>(bias_vec[2 * i]);
// input_right = input_right * out_scale_vec[2 * i + 1] +
// static_cast<float>(bias_vec[2 * i + 1]);
input_left = input_left * out_scale_vec[2 * i] +
static_cast<float>(bias_vec[2 * i]);
input_right = input_right * out_scale_vec[2 * i + 1] +
static_cast<float>(bias_vec[2 * i + 1]);
const float cos_tmp = cos_emb_vec[i];
const float sin_tmp = sin_emb_vec[i];
bias_vec[2 * i] =
@@ -1171,35 +1167,6 @@ __global__ void append_speculate_cache_int4_rope_kernel(
using LoadPadKVT = AlignedVector<uint8_t, KV_VEC_SIZE>;
const uint32_t kv_head_idx = (head_idx - num_heads) % gqa_group_size;
if (block_offset == 0) {
// pad zero for this kv_head_idx for this block
LoadPadKVT pad_cache_vec;
*(reinterpret_cast<uint4*>(pad_cache_vec.val)) = make_uint4(0, 0, 0, 0);
if (head_idx < num_heads + gqa_group_size) {
constexpr int num_vecs_per_head_dim = half_head_size / KV_VEC_SIZE;
constexpr int num_token_each_time = 32 / num_vecs_per_head_dim;
const uint32_t tgt_idx = (block_idx * gqa_group_size + kv_head_idx) *
block_size * half_head_size +
lane_id % num_vecs_per_head_dim * KV_VEC_SIZE;
for (int block_i = lane_id / num_vecs_per_head_dim;
block_i < block_size;
block_i += num_token_each_time) {
Store<uint8_t, KV_VEC_SIZE>(
pad_cache_vec, &key_cache[tgt_idx + block_i * half_head_size]);
}
} else {
const int num_vecs_per_head_dim = half_block_size / KV_VEC_SIZE;
const int num_token_each_time = 32 / num_vecs_per_head_dim;
const uint32_t tgt_idx = (block_idx * gqa_group_size + kv_head_idx) *
HeadDim * half_block_size +
lane_id % num_vecs_per_head_dim * KV_VEC_SIZE;
for (int block_i = lane_id / num_vecs_per_head_dim; block_i < HeadDim;
block_i += num_token_each_time) {
Store<uint8_t, KV_VEC_SIZE>(
pad_cache_vec, &value_cache[tgt_idx + block_i * half_block_size]);
}
}
}
constexpr int K_VEC_SIZE = 4;
constexpr int HALF_K_VEC_SIZE = 2;
using LoadKVResT = AlignedVector<uint8_t, K_VEC_SIZE>;
@@ -1215,11 +1182,7 @@ __global__ void append_speculate_cache_int4_rope_kernel(
LoadScaleT zp_vec1, zp_vec2;
LoadEmbT cos_emb_vec1, cos_emb_vec2;
LoadEmbT sin_emb_vec1, sin_emb_vec2;
#pragma unroll
for (int v_i = 0; v_i < HALF_K_VEC_SIZE; v_i++) {
bias_vec1[v_i] = 0;
bias_vec2[v_i] = 0;
}
const InT* qkv_now = quant_qkv + token_id * hidden_size;
const int head_bias = lane_id / 4 * 16 + lane_id % 4 * 2;
//////////
@@ -1228,11 +1191,11 @@ __global__ void append_speculate_cache_int4_rope_kernel(
Load<InT, HALF_K_VEC_SIZE>(&qkv_now[bias_idx], &src_vec1);
Load<InT, HALF_K_VEC_SIZE>(&qkv_now[bias_idx + 8], &src_vec2);
/////
// Load<T, HALF_K_VEC_SIZE>(&qkv_biases[bias_idx], &bias_vec1);
// Load<T, HALF_K_VEC_SIZE>(&qkv_biases[bias_idx + 8], &bias_vec2);
// Load<float, HALF_K_VEC_SIZE>(&qkv_out_scales[bias_idx], &out_scale_vec1);
// Load<float, HALF_K_VEC_SIZE>(&qkv_out_scales[bias_idx + 8],
// &out_scale_vec2);
Load<T, HALF_K_VEC_SIZE>(&qkv_biases[bias_idx], &bias_vec1);
Load<T, HALF_K_VEC_SIZE>(&qkv_biases[bias_idx + 8], &bias_vec2);
Load<float, HALF_K_VEC_SIZE>(&qkv_out_scales[bias_idx], &out_scale_vec1);
Load<float, HALF_K_VEC_SIZE>(&qkv_out_scales[bias_idx + 8],
&out_scale_vec2);
if (head_idx < num_heads + gqa_group_size) {
const uint32_t emb_idx = write_seq_id * half_head_size + head_bias / 2;
Load<float, 1>(&cos_emb[emb_idx], &cos_emb_vec1);
@@ -1252,10 +1215,10 @@ __global__ void append_speculate_cache_int4_rope_kernel(
float input_left = static_cast<float>(src_vec1[0]);
float input_right = static_cast<float>(src_vec1[1]);
// input_left =
// input_left * out_scale_vec1[0] + static_cast<float>(bias_vec1[0]);
// input_right =
// input_right * out_scale_vec1[1] + static_cast<float>(bias_vec1[1]);
input_left =
input_left * out_scale_vec1[0] + static_cast<float>(bias_vec1[0]);
input_right =
input_right * out_scale_vec1[1] + static_cast<float>(bias_vec1[1]);
if (head_idx < num_heads + gqa_group_size) {
float cos_tmp = cos_emb_vec1[0];
float sin_tmp = sin_emb_vec1[0];
@@ -1270,10 +1233,10 @@ __global__ void append_speculate_cache_int4_rope_kernel(
input_left = static_cast<float>(src_vec2[0]);
input_right = static_cast<float>(src_vec2[1]);
// input_left =
// input_left * out_scale_vec2[0] + static_cast<float>(bias_vec2[0]);
// input_right =
// input_right * out_scale_vec2[1] + static_cast<float>(bias_vec2[1]);
input_left =
input_left * out_scale_vec2[0] + static_cast<float>(bias_vec2[0]);
input_right =
input_right * out_scale_vec2[1] + static_cast<float>(bias_vec2[1]);
if (head_idx < num_heads + gqa_group_size) {
float cos_tmp = cos_emb_vec2[0];
float sin_tmp = sin_emb_vec2[0];
@@ -1411,8 +1374,8 @@ __global__ void append_speculate_cache_int4_neox_rope_kernel(
// block_size, head_size // 2]
T* __restrict__ qkv_out,
const int* __restrict__ block_tables, // [bsz, max_blocks_per_seq]
const int* __restrict__ batch_id_per_token, // [num_tokens]
const int* __restrict__ cu_seqlens_q,
const int* __restrict__ padding_offsets, // [num_tokens]
const int* __restrict__ cum_offsets,
const int* __restrict__ seq_lens, // [bsz]
const int* __restrict__ seq_lens_encoder, // [bsz]
const float* __restrict__ cos_emb,
@@ -1440,10 +1403,10 @@ __global__ void append_speculate_cache_int4_neox_rope_kernel(
const int lane_id = tid % 32;
const int token_id = blockIdx.x;
const int ori_token_id = token_id + padding_offsets[token_id];
const int bid = ori_token_id / max_seq_len;
const int bid = batch_id_per_token[token_id];
const int start_token_idx = cu_seqlens_q[bid];
const int start_token_idx = bid * max_seq_len - cum_offsets[bid];
const int head_idx = blockIdx.y * NUM_WARPS + wid;
const int64_t hidden_size = (num_heads + 2 * gqa_group_size) * HeadDim;
@@ -1829,4 +1792,4 @@ __global__ void append_speculate_cache_int4_neox_rope_kernel(
(uint_quant_value2 << 4) | (uint_quant_value1 & 0x0F);
}
}
}
}
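The int8/int4 speculate-cache kernels above repeat one per-pair transform: dequantize with the output scale and bias, then rotate each (even, odd) element pair by the cached cos and sin values. A scalar sketch of that step, with illustrative names rather than the kernel's actual helpers:

__device__ __forceinline__ void dequant_rope_pair(
    float &left, float &right,
    float scale_l, float scale_r, float bias_l, float bias_r,
    float cos_v, float sin_v) {
  // dequant + add_bias
  const float l = left * scale_l + bias_l;
  const float r = right * scale_r + bias_r;
  // rotary embedding on the (even, odd) pair
  left  = l * cos_v - r * sin_v;
  right = r * cos_v + l * sin_v;
}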

View File

@@ -22,8 +22,8 @@ void append_speculate_cache_rope(const QKV_TYPE* qkv,
T* value_cache,
T* qkv_out,
const int* block_tables,
const int* batch_id_per_token,
const int* cu_seqlens_q,
const int* padding_offsets,
const int* cum_offsets,
const int* seq_lens,
const int* seq_lens_encoder,
const float* cos_emb,
@@ -59,8 +59,8 @@ void append_speculate_cache_rope(const QKV_TYPE* qkv,
value_cache,
qkv_out,
block_tables,
batch_id_per_token,
cu_seqlens_q,
padding_offsets,
cum_offsets,
seq_lens,
cos_emb,
sin_emb,
@@ -82,8 +82,8 @@ void append_speculate_cache_rope(const QKV_TYPE* qkv,
value_cache,
qkv_out,
block_tables,
batch_id_per_token,
cu_seqlens_q,
padding_offsets,
cum_offsets,
seq_lens,
cos_emb,
sin_emb,
@@ -106,8 +106,8 @@ void append_speculate_cache_int8_rope(const QKV_TYPE* qkv,
uint8_t* value_cache,
T* qkv_out,
const int* block_tables,
const int* batch_id_per_token,
const int* cu_seqlens_q,
const int* padding_offsets,
const int* cum_offsets,
const int* seq_lens,
const int* seq_lens_encoder,
const float* cos_emb,
@@ -136,8 +136,8 @@ void append_speculate_cache_int8_rope(const QKV_TYPE* qkv,
value_cache,
seq_lens,
block_tables,
batch_id_per_token,
cu_seqlens_q,
padding_offsets,
cum_offsets,
seq_lens_encoder,
max_seq_len,
max_blocks_per_seq,
@@ -151,8 +151,8 @@ void append_speculate_cache_int8_rope(const QKV_TYPE* qkv,
value_cache,
qkv_out,
block_tables,
batch_id_per_token,
cu_seqlens_q,
padding_offsets,
cum_offsets,
seq_lens,
seq_lens_encoder,
cos_emb,
@@ -175,8 +175,8 @@ void append_speculate_cache_int8_rope(const QKV_TYPE* qkv,
value_cache,
qkv_out,
block_tables,
batch_id_per_token,
cu_seqlens_q,
padding_offsets,
cum_offsets,
seq_lens,
seq_lens_encoder,
cos_emb,
@@ -201,8 +201,8 @@ void append_speculate_cache_int4_rope(const QKV_TYPE* qkv,
uint8_t* value_cache,
T* qkv_out,
const int* block_tables,
const int* batch_id_per_token,
const int* cu_seqlens_q,
const int* padding_offsets,
const int* cum_offsets,
const int* seq_lens,
const int* seq_lens_encoder,
const float* cos_emb,
@@ -233,8 +233,8 @@ void append_speculate_cache_int4_rope(const QKV_TYPE* qkv,
value_cache,
seq_lens,
block_tables,
batch_id_per_token,
cu_seqlens_q,
padding_offsets,
cum_offsets,
seq_lens_encoder,
max_seq_len,
max_blocks_per_seq,
@@ -248,8 +248,8 @@ void append_speculate_cache_int4_rope(const QKV_TYPE* qkv,
value_cache,
qkv_out,
block_tables,
batch_id_per_token,
cu_seqlens_q,
padding_offsets,
cum_offsets,
seq_lens,
seq_lens_encoder,
cos_emb,
@@ -274,8 +274,8 @@ void append_speculate_cache_int4_rope(const QKV_TYPE* qkv,
value_cache,
qkv_out,
block_tables,
batch_id_per_token,
cu_seqlens_q,
padding_offsets,
cum_offsets,
seq_lens,
seq_lens_encoder,
cos_emb,
@@ -301,8 +301,8 @@ void SpeculateWriteCacheWithRoPEKernel(
const paddle::Tensor& qkv,
const paddle::Tensor& seq_lens,
const paddle::Tensor& seq_lens_encoder,
const paddle::Tensor& batch_id_per_token,
const paddle::Tensor& cu_seqlens_q,
const paddle::Tensor& padding_offsets,
const paddle::Tensor& cum_offsets,
const paddle::Tensor& block_tables,
const paddle::optional<paddle::Tensor>& rotary_embs,
const paddle::optional<paddle::Tensor>& qkv_out_scales,
@@ -349,8 +349,8 @@ void SpeculateWriteCacheWithRoPEKernel(
reinterpret_cast<DataType_*>(value_cache_out->data<T>()),
reinterpret_cast<DataType_*>(qkv_out->data<T>()),
block_tables.data<int>(),
batch_id_per_token.data<int>(),
cu_seqlens_q.data<int>(),
padding_offsets.data<int>(),
cum_offsets.data<int>(),
seq_lens.data<int>(),
seq_lens_encoder.data<int>(),
cos_emb,
@@ -376,8 +376,8 @@ void SpeculateWriteCacheWithRoPEKernel(
value_cache_out->data<uint8_t>(),
reinterpret_cast<DataType_*>(qkv_out->data<T>()),
block_tables.data<int>(),
batch_id_per_token.data<int>(),
cu_seqlens_q.data<int>(),
padding_offsets.data<int>(),
cum_offsets.data<int>(),
seq_lens.data<int>(),
seq_lens_encoder.data<int>(),
cos_emb,
@@ -409,8 +409,8 @@ void SpeculateWriteCacheWithRoPEKernel(
value_cache_out->data<uint8_t>(),
reinterpret_cast<DataType_*>(qkv_out->data<T>()),
block_tables.data<int>(),
batch_id_per_token.data<int>(),
cu_seqlens_q.data<int>(),
padding_offsets.data<int>(),
cum_offsets.data<int>(),
seq_lens.data<int>(),
seq_lens_encoder.data<int>(),
cos_emb,
@@ -442,8 +442,8 @@ void SpeculateWriteCacheWithRoPEKernel(
value_cache_out->data<uint8_t>(),
reinterpret_cast<DataType_*>(const_cast<T*>(qkv_out->data<T>())),
block_tables.data<int>(),
batch_id_per_token.data<int>(),
cu_seqlens_q.data<int>(),
padding_offsets.data<int>(),
cum_offsets.data<int>(),
seq_lens.data<int>(),
seq_lens_encoder.data<int>(),
cos_emb,
@@ -488,8 +488,8 @@ template void SpeculateWriteCacheWithRoPEKernel<paddle::bfloat16, int>(
// gqa_group_size, head_dim] if GQA)
const paddle::Tensor& seq_lens,
const paddle::Tensor& seq_lens_encoder,
const paddle::Tensor& batch_id_per_token,
const paddle::Tensor& cu_seqlens_q,
const paddle::Tensor& padding_offsets,
const paddle::Tensor& cum_offsets,
const paddle::Tensor& block_tables,
const paddle::optional<paddle::Tensor>& rotary_embs,
const paddle::optional<paddle::Tensor>& qkv_out_scales,
@@ -514,8 +514,8 @@ SpeculateWriteCacheWithRoPEKernel<paddle::bfloat16, paddle::bfloat16>(
// gqa_group_size, head_dim] if GQA)
const paddle::Tensor& seq_lens,
const paddle::Tensor& seq_lens_encoder,
const paddle::Tensor& batch_id_per_token,
const paddle::Tensor& cu_seqlens_q,
const paddle::Tensor& padding_offsets,
const paddle::Tensor& cum_offsets,
const paddle::Tensor& block_tables,
const paddle::optional<paddle::Tensor>& rotary_embs,
const paddle::optional<paddle::Tensor>& qkv_out_scales,
@@ -539,8 +539,8 @@ template void SpeculateWriteCacheWithRoPEKernel<paddle::float16, int>(
// gqa_group_size, head_dim] if GQA)
const paddle::Tensor& seq_lens,
const paddle::Tensor& seq_lens_encoder,
const paddle::Tensor& batch_id_per_token,
const paddle::Tensor& cu_seqlens_q,
const paddle::Tensor& padding_offsets,
const paddle::Tensor& cum_offsets,
const paddle::Tensor& block_tables,
const paddle::optional<paddle::Tensor>& rotary_embs,
const paddle::optional<paddle::Tensor>& qkv_out_scales,
@@ -566,8 +566,8 @@ SpeculateWriteCacheWithRoPEKernel<paddle::float16, paddle::float16>(
// gqa_group_size, head_dim] if GQA)
const paddle::Tensor& seq_lens,
const paddle::Tensor& seq_lens_encoder,
const paddle::Tensor& batch_id_per_token,
const paddle::Tensor& cu_seqlens_q,
const paddle::Tensor& padding_offsets,
const paddle::Tensor& cum_offsets,
const paddle::Tensor& block_tables,
const paddle::optional<paddle::Tensor>& rotary_embs,
const paddle::optional<paddle::Tensor>& qkv_out_scales,
@@ -582,4 +582,4 @@ SpeculateWriteCacheWithRoPEKernel<paddle::float16, paddle::float16>(
cudaStream_t& stream,
paddle::Tensor* qkv_out,
paddle::Tensor* key_cache_out,
paddle::Tensor* value_cache_out);
paddle::Tensor* value_cache_out);

View File

@@ -23,8 +23,8 @@ void SpeculateWriteCacheWithRoPEKernel(
// gqa_group_size, head_dim] if GQA)
const paddle::Tensor& seq_lens,
const paddle::Tensor& seq_lens_encoder,
const paddle::Tensor& batch_id_per_token,
const paddle::Tensor& cu_seqlens_q,
const paddle::Tensor& padding_offsets,
const paddle::Tensor& cum_offsets,
const paddle::Tensor& block_tables,
const paddle::optional<paddle::Tensor>& rotary_embs,
const paddle::optional<paddle::Tensor>& qkv_out_scales,
@@ -39,4 +39,4 @@ void SpeculateWriteCacheWithRoPEKernel(
cudaStream_t& stream,
paddle::Tensor* qkv_out,
paddle::Tensor* key_cache_out,
paddle::Tensor* value_cache_out);
paddle::Tensor* value_cache_out);

View File

@@ -37,8 +37,8 @@ template void CascadeAppendAttentionC16Kernel<paddle::bfloat16, paddle::bfloat16
const paddle::Tensor& seq_lens_q,
const paddle::Tensor& seq_lens_kv,
const paddle::Tensor& seq_lens_encoder,
const paddle::Tensor& batch_id_per_token,
const paddle::Tensor& cu_seqlens_q,
const paddle::Tensor& padding_offsets,
const paddle::Tensor& cum_offsets,
const paddle::Tensor& block_table,
const paddle::Tensor& batch_ids,
const paddle::Tensor& tile_ids_per_batch,

View File

@@ -36,8 +36,8 @@ template void CascadeAppendAttentionC16Kernel<paddle::bfloat16, paddle::float8_e
const paddle::Tensor& seq_lens_q,
const paddle::Tensor& seq_lens_kv,
const paddle::Tensor& seq_lens_encoder,
const paddle::Tensor& batch_id_per_token,
const paddle::Tensor& cu_seqlens_q,
const paddle::Tensor& padding_offsets,
const paddle::Tensor& cum_offsets,
const paddle::Tensor& block_table,
const paddle::Tensor& batch_ids,
const paddle::Tensor& tile_ids_per_batch,

View File

@@ -36,8 +36,8 @@ template void CascadeAppendAttentionC16Kernel<paddle::bfloat16, int8_t>(
const paddle::Tensor& seq_lens_q,
const paddle::Tensor& seq_lens_kv,
const paddle::Tensor& seq_lens_encoder,
const paddle::Tensor& batch_id_per_token,
const paddle::Tensor& cu_seqlens_q,
const paddle::Tensor& padding_offsets,
const paddle::Tensor& cum_offsets,
const paddle::Tensor& block_table,
const paddle::Tensor& batch_ids,
const paddle::Tensor& tile_ids_per_batch,

View File

@@ -36,8 +36,8 @@ template void CascadeAppendAttentionC16Kernel<paddle::float16, paddle::float16>(
const paddle::Tensor& seq_lens_q,
const paddle::Tensor& seq_lens_kv,
const paddle::Tensor& seq_lens_encoder,
const paddle::Tensor& batch_id_per_token,
const paddle::Tensor& cu_seqlens_q,
const paddle::Tensor& padding_offsets,
const paddle::Tensor& cum_offsets,
const paddle::Tensor& block_table,
const paddle::Tensor& batch_ids,
const paddle::Tensor& tile_ids_per_batch,

View File

@@ -36,8 +36,8 @@ template void CascadeAppendAttentionC16Kernel<paddle::float16, paddle::float8_e4
const paddle::Tensor& seq_lens_q,
const paddle::Tensor& seq_lens_kv,
const paddle::Tensor& seq_lens_encoder,
const paddle::Tensor& batch_id_per_token,
const paddle::Tensor& cu_seqlens_q,
const paddle::Tensor& padding_offsets,
const paddle::Tensor& cum_offsets,
const paddle::Tensor& block_table,
const paddle::Tensor& batch_ids,
const paddle::Tensor& tile_ids_per_batch,

View File

@@ -36,8 +36,8 @@ template void CascadeAppendAttentionC16Kernel<paddle::float16, int8_t>(
const paddle::Tensor& seq_lens_q,
const paddle::Tensor& seq_lens_kv,
const paddle::Tensor& seq_lens_encoder,
const paddle::Tensor& batch_id_per_token,
const paddle::Tensor& cu_seqlens_q,
const paddle::Tensor& padding_offsets,
const paddle::Tensor& cum_offsets,
const paddle::Tensor& block_table,
const paddle::Tensor& batch_ids,
const paddle::Tensor& tile_ids_per_batch,

View File

@@ -36,8 +36,8 @@ template void CascadeAppendAttentionC4Kernel<paddle::bfloat16, paddle::bfloat16>
const paddle::Tensor& seq_lens_q,
const paddle::Tensor& seq_lens_kv,
const paddle::Tensor& seq_lens_encoder,
const paddle::Tensor& batch_id_per_token,
const paddle::Tensor& cu_seqlens_q,
const paddle::Tensor& padding_offsets,
const paddle::Tensor& cum_offsets,
const paddle::Tensor& block_table,
const paddle::Tensor& batch_ids,
const paddle::Tensor& tile_ids_per_batch,

View File

@@ -36,8 +36,8 @@ template void CascadeAppendAttentionC4Kernel<paddle::bfloat16, paddle::float8_e4
const paddle::Tensor& seq_lens_q,
const paddle::Tensor& seq_lens_kv,
const paddle::Tensor& seq_lens_encoder,
const paddle::Tensor& batch_id_per_token,
const paddle::Tensor& cu_seqlens_q,
const paddle::Tensor& padding_offsets,
const paddle::Tensor& cum_offsets,
const paddle::Tensor& block_table,
const paddle::Tensor& batch_ids,
const paddle::Tensor& tile_ids_per_batch,

View File

@@ -36,8 +36,8 @@ template void CascadeAppendAttentionC4Kernel<paddle::bfloat16, int8_t>(
const paddle::Tensor& seq_lens_q,
const paddle::Tensor& seq_lens_kv,
const paddle::Tensor& seq_lens_encoder,
const paddle::Tensor& batch_id_per_token,
const paddle::Tensor& cu_seqlens_q,
const paddle::Tensor& padding_offsets,
const paddle::Tensor& cum_offsets,
const paddle::Tensor& block_table,
const paddle::Tensor& batch_ids,
const paddle::Tensor& tile_ids_per_batch,

View File

@@ -37,8 +37,8 @@ template void CascadeAppendAttentionC4Kernel<paddle::float16, paddle::float16>(
const paddle::Tensor& seq_lens_q,
const paddle::Tensor& seq_lens_kv,
const paddle::Tensor& seq_lens_encoder,
const paddle::Tensor& batch_id_per_token,
const paddle::Tensor& cu_seqlens_q,
const paddle::Tensor& padding_offsets,
const paddle::Tensor& cum_offsets,
const paddle::Tensor& block_table,
const paddle::Tensor& batch_ids,
const paddle::Tensor& tile_ids_per_batch,

View File

@@ -36,8 +36,8 @@ template void CascadeAppendAttentionC4Kernel<paddle::float16, paddle::float8_e4m
const paddle::Tensor& seq_lens_q,
const paddle::Tensor& seq_lens_kv,
const paddle::Tensor& seq_lens_encoder,
const paddle::Tensor& batch_id_per_token,
const paddle::Tensor& cu_seqlens_q,
const paddle::Tensor& padding_offsets,
const paddle::Tensor& cum_offsets,
const paddle::Tensor& block_table,
const paddle::Tensor& batch_ids,
const paddle::Tensor& tile_ids_per_batch,

View File

@@ -36,8 +36,8 @@ template void CascadeAppendAttentionC4Kernel<paddle::float16, int8_t>(
const paddle::Tensor& seq_lens_q,
const paddle::Tensor& seq_lens_kv,
const paddle::Tensor& seq_lens_encoder,
const paddle::Tensor& batch_id_per_token,
const paddle::Tensor& cu_seqlens_q,
const paddle::Tensor& padding_offsets,
const paddle::Tensor& cum_offsets,
const paddle::Tensor& block_table,
const paddle::Tensor& batch_ids,
const paddle::Tensor& tile_ids_per_batch,

View File

@@ -38,8 +38,8 @@ CascadeAppendAttentionC8Kernel<paddle::bfloat16, paddle::bfloat16, false>(
const paddle::Tensor& seq_lens_q,
const paddle::Tensor& seq_lens_kv,
const paddle::Tensor& seq_lens_encoder,
const paddle::Tensor& batch_id_per_token,
const paddle::Tensor& cu_seqlens_q,
const paddle::Tensor& padding_offsets,
const paddle::Tensor& cum_offsets,
const paddle::Tensor& block_table,
const paddle::Tensor& batch_ids,
const paddle::Tensor& tile_ids_per_batch,
@@ -85,8 +85,8 @@ CascadeAppendAttentionC8Kernel<paddle::bfloat16, paddle::bfloat16, true>(
const paddle::Tensor& seq_lens_q,
const paddle::Tensor& seq_lens_kv,
const paddle::Tensor& seq_lens_encoder,
const paddle::Tensor& batch_id_per_token,
const paddle::Tensor& cu_seqlens_q,
const paddle::Tensor& padding_offsets,
const paddle::Tensor& cum_offsets,
const paddle::Tensor& block_table,
const paddle::Tensor& batch_ids,
const paddle::Tensor& tile_ids_per_batch,

View File

@@ -36,8 +36,8 @@ template void CascadeAppendAttentionC8Kernel<paddle::bfloat16, paddle::float8_e4
const paddle::Tensor& seq_lens_q,
const paddle::Tensor& seq_lens_kv,
const paddle::Tensor& seq_lens_encoder,
const paddle::Tensor& batch_id_per_token,
const paddle::Tensor& cu_seqlens_q,
const paddle::Tensor& padding_offsets,
const paddle::Tensor& cum_offsets,
const paddle::Tensor& block_table,
const paddle::Tensor& batch_ids,
const paddle::Tensor& tile_ids_per_batch,
@@ -80,8 +80,8 @@ template void CascadeAppendAttentionC8Kernel<paddle::bfloat16, paddle::float8_e4
const paddle::Tensor& seq_lens_q,
const paddle::Tensor& seq_lens_kv,
const paddle::Tensor& seq_lens_encoder,
const paddle::Tensor& batch_id_per_token,
const paddle::Tensor& cu_seqlens_q,
const paddle::Tensor& padding_offsets,
const paddle::Tensor& cum_offsets,
const paddle::Tensor& block_table,
const paddle::Tensor& batch_ids,
const paddle::Tensor& tile_ids_per_batch,

View File

@@ -36,8 +36,8 @@ template void CascadeAppendAttentionC8Kernel<paddle::bfloat16, int8_t, false>(
const paddle::Tensor& seq_lens_q,
const paddle::Tensor& seq_lens_kv,
const paddle::Tensor& seq_lens_encoder,
const paddle::Tensor& batch_id_per_token,
const paddle::Tensor& cu_seqlens_q,
const paddle::Tensor& padding_offsets,
const paddle::Tensor& cum_offsets,
const paddle::Tensor& block_table,
const paddle::Tensor& batch_ids,
const paddle::Tensor& tile_ids_per_batch,
@@ -82,8 +82,8 @@ template void CascadeAppendAttentionC8Kernel<paddle::bfloat16, int8_t, true>(
const paddle::Tensor& seq_lens_q,
const paddle::Tensor& seq_lens_kv,
const paddle::Tensor& seq_lens_encoder,
const paddle::Tensor& batch_id_per_token,
const paddle::Tensor& cu_seqlens_q,
const paddle::Tensor& padding_offsets,
const paddle::Tensor& cum_offsets,
const paddle::Tensor& block_table,
const paddle::Tensor& batch_ids,
const paddle::Tensor& tile_ids_per_batch,

View File

@@ -36,8 +36,8 @@ template void CascadeAppendAttentionC8Kernel<paddle::float16, paddle::float16, f
const paddle::Tensor& seq_lens_q,
const paddle::Tensor& seq_lens_kv,
const paddle::Tensor& seq_lens_encoder,
const paddle::Tensor& batch_id_per_token,
const paddle::Tensor& cu_seqlens_q,
const paddle::Tensor& padding_offsets,
const paddle::Tensor& cum_offsets,
const paddle::Tensor& block_table,
const paddle::Tensor& batch_ids,
const paddle::Tensor& tile_ids_per_batch,
@@ -82,8 +82,8 @@ template void CascadeAppendAttentionC8Kernel<paddle::float16, paddle::float16, t
const paddle::Tensor& seq_lens_q,
const paddle::Tensor& seq_lens_kv,
const paddle::Tensor& seq_lens_encoder,
const paddle::Tensor& batch_id_per_token,
const paddle::Tensor& cu_seqlens_q,
const paddle::Tensor& padding_offsets,
const paddle::Tensor& cum_offsets,
const paddle::Tensor& block_table,
const paddle::Tensor& batch_ids,
const paddle::Tensor& tile_ids_per_batch,

View File

@@ -36,8 +36,8 @@ template void CascadeAppendAttentionC8Kernel<paddle::float16, paddle::float8_e4m
const paddle::Tensor& seq_lens_q,
const paddle::Tensor& seq_lens_kv,
const paddle::Tensor& seq_lens_encoder,
const paddle::Tensor& batch_id_per_token,
const paddle::Tensor& cu_seqlens_q,
const paddle::Tensor& padding_offsets,
const paddle::Tensor& cum_offsets,
const paddle::Tensor& block_table,
const paddle::Tensor& batch_ids,
const paddle::Tensor& tile_ids_per_batch,
@@ -81,8 +81,8 @@ template void CascadeAppendAttentionC8Kernel<paddle::float16, paddle::float8_e4m
const paddle::Tensor& seq_lens_q,
const paddle::Tensor& seq_lens_kv,
const paddle::Tensor& seq_lens_encoder,
const paddle::Tensor& batch_id_per_token,
const paddle::Tensor& cu_seqlens_q,
const paddle::Tensor& padding_offsets,
const paddle::Tensor& cum_offsets,
const paddle::Tensor& block_table,
const paddle::Tensor& batch_ids,
const paddle::Tensor& tile_ids_per_batch,

View File

@@ -36,8 +36,8 @@ template void CascadeAppendAttentionC8Kernel<paddle::float16, int8_t, false>(
const paddle::Tensor& seq_lens_q,
const paddle::Tensor& seq_lens_kv,
const paddle::Tensor& seq_lens_encoder,
const paddle::Tensor& batch_id_per_token,
const paddle::Tensor& cu_seqlens_q,
const paddle::Tensor& padding_offsets,
const paddle::Tensor& cum_offsets,
const paddle::Tensor& block_table,
const paddle::Tensor& batch_ids,
const paddle::Tensor& tile_ids_per_batch,
@@ -81,8 +81,8 @@ template void CascadeAppendAttentionC8Kernel<paddle::float16, int8_t, true>(
const paddle::Tensor& seq_lens_q,
const paddle::Tensor& seq_lens_kv,
const paddle::Tensor& seq_lens_encoder,
const paddle::Tensor& batch_id_per_token,
const paddle::Tensor& cu_seqlens_q,
const paddle::Tensor& padding_offsets,
const paddle::Tensor& cum_offsets,
const paddle::Tensor& block_table,
const paddle::Tensor& batch_ids,
const paddle::Tensor& tile_ids_per_batch,

View File

@@ -22,8 +22,8 @@ EncoderWriteCacheWithRopeKernel<paddle::bfloat16, paddle::bfloat16>(
const paddle::Tensor& seq_lens_this_time,
const paddle::Tensor& seq_lens_encoder,
const paddle::Tensor& seq_lens_decoder,
const paddle::Tensor& batch_id_per_token,
const paddle::Tensor& cu_seqlens_q,
const paddle::Tensor& padding_offsets,
const paddle::Tensor& cum_offsets,
const paddle::Tensor& block_tables,
const paddle::Tensor& batch_ids,
const paddle::Tensor& tile_ids,

View File

@@ -21,8 +21,8 @@ template void EncoderWriteCacheWithRopeKernel<paddle::bfloat16, int>(
const paddle::Tensor& seq_lens_this_time,
const paddle::Tensor& seq_lens_encoder,
const paddle::Tensor& seq_lens_decoder,
const paddle::Tensor& batch_id_per_token,
const paddle::Tensor& cu_seqlens_q,
const paddle::Tensor& padding_offsets,
const paddle::Tensor& cum_offsets,
const paddle::Tensor& block_tables,
const paddle::Tensor& batch_ids,
const paddle::Tensor& tile_ids,

View File

@@ -21,8 +21,8 @@ template void EncoderWriteCacheWithRopeKernel<paddle::float16, paddle::float16>(
const paddle::Tensor& seq_lens_this_time,
const paddle::Tensor& seq_lens_encoder,
const paddle::Tensor& seq_lens_decoder,
const paddle::Tensor& batch_id_per_token,
const paddle::Tensor& cu_seqlens_q,
const paddle::Tensor& padding_offsets,
const paddle::Tensor& cum_offsets,
const paddle::Tensor& block_tables,
const paddle::Tensor& batch_ids,
const paddle::Tensor& tile_ids,

View File

@@ -21,8 +21,8 @@ template void EncoderWriteCacheWithRopeKernel<paddle::float16, int>(
const paddle::Tensor& seq_lens_this_time,
const paddle::Tensor& seq_lens_encoder,
const paddle::Tensor& seq_lens_decoder,
const paddle::Tensor& batch_id_per_token,
const paddle::Tensor& cu_seqlens_q,
const paddle::Tensor& padding_offsets,
const paddle::Tensor& cum_offsets,
const paddle::Tensor& block_tables,
const paddle::Tensor& batch_ids,
const paddle::Tensor& tile_ids,

View File

@@ -30,4 +30,4 @@ inline int getSMVersion()
return sm_major * 10 + sm_minor;
}
}
}

View File

@@ -54,7 +54,7 @@ std::vector<paddle::Tensor> AppendAttention(
const paddle::Tensor &value_cache, const paddle::Tensor &seq_lens_encoder,
const paddle::Tensor &seq_lens_decoder,
const paddle::Tensor &seq_lens_this_time,
const paddle::Tensor &batch_id_per_token, const paddle::Tensor &cu_seqlens_q,
const paddle::Tensor &padding_offsets, const paddle::Tensor &cum_offsets,
const paddle::Tensor &block_tables, const paddle::Tensor &encoder_batch_ids,
const paddle::Tensor &encoder_tile_ids_per_batch,
const paddle::Tensor &encoder_num_blocks,
@@ -94,7 +94,7 @@ std::vector<paddle::Tensor> GQARopeWriteCacheKernel(
const paddle::Tensor &seq_lens_this_time,
const paddle::Tensor &seq_lens_encoder,
const paddle::Tensor &seq_lens_decoder,
const paddle::Tensor &batch_id_per_token,
const paddle::Tensor &padding_offsets, const paddle::Tensor &cum_offsets,
const paddle::Tensor &block_tables, const paddle::Tensor &kv_batch_ids,
const paddle::Tensor &kv_tile_ids, const paddle::Tensor &kv_num_blocks,
const paddle::Tensor &cache_batch_ids, const paddle::Tensor &cache_tile_ids,
@@ -116,11 +116,11 @@ PreCacheLenConcat(const paddle::Tensor &seq_lens_decoder,
paddle::Tensor FusedExpertMoeFunc(
const paddle::Tensor &input, const paddle::Tensor &gate_weight,
const paddle::Tensor &up_gate_proj_weight, const paddle::Tensor &down_proj_weight,
const paddle::optional<paddle::Tensor> &up_gate_proj_bias,
const paddle::optional<paddle::Tensor> &up_gate_proj_scale,
const paddle::optional<paddle::Tensor> &down_proj_bias,
const paddle::optional<paddle::Tensor> &down_proj_scale,
const paddle::Tensor &ffn1_weight, const paddle::Tensor &ffn2_weight,
const paddle::optional<paddle::Tensor> &ffn1_bias,
const paddle::optional<paddle::Tensor> &ffn1_scale,
const paddle::optional<paddle::Tensor> &ffn2_bias,
const paddle::optional<paddle::Tensor> &ffn2_scale,
const std::string &quant_method, const int moe_topk,
const bool norm_topk_prob, const bool group_moe);
@@ -149,7 +149,7 @@ MoERedundantTopKSelectKernel(const paddle::Tensor &gating_logits,
std::vector<paddle::Tensor>
EPMoeExpertDispatch(const paddle::Tensor &input, const paddle::Tensor &topk_ids,
const paddle::Tensor &topk_weights,
const paddle::optional<paddle::Tensor> &up_gate_proj_in_scale,
const paddle::optional<paddle::Tensor> &ffn1_in_scale,
const std::vector<int> &token_nums_per_expert,
const int token_nums_this_rank,
const std::string &moe_quant_type);
@@ -158,8 +158,7 @@ std::vector<paddle::Tensor> EPMoeExpertDispatchFP8(
const paddle::Tensor &input, const paddle::Tensor &scale,
const paddle::Tensor &topk_ids, const paddle::Tensor &topk_weights,
const paddle::Tensor &token_nums_per_expert,
const paddle::Tensor &token_nums_per_expert_padded,
const bool use_in_ep, const int token_nums_this_rank_padded);
const paddle::Tensor &token_nums_per_expert_padded);
std::vector<paddle::Tensor> PerTokenQuant(paddle::Tensor &input,
const int block_size);
@@ -173,7 +172,7 @@ std::vector<paddle::Tensor> EPMoeExpertCombine(
const paddle::Tensor &ffn_out, const paddle::Tensor &expert_scales_float,
const paddle::Tensor &permute_indices_per_token,
const paddle::Tensor &top_k_indices,
const paddle::optional<paddle::Tensor> &down_proj_bias,
const paddle::optional<paddle::Tensor> &ffn2_bias,
const bool norm_topk_prob, const float routed_scaling_factor);
std::vector<std::vector<int>> GetExpertTokenNum(const paddle::Tensor &topk_ids,
@@ -182,35 +181,35 @@ std::vector<std::vector<int>> GetExpertTokenNum(const paddle::Tensor &topk_ids,
paddle::Tensor MoeExpertFFNFunc(
const paddle::Tensor& permute_input,
const paddle::Tensor& tokens_expert_prefix_sum,
const paddle::Tensor& up_gate_proj_weight, const paddle::Tensor& down_proj_weight,
const paddle::optional<paddle::Tensor>& up_gate_proj_bias,
const paddle::optional<paddle::Tensor>& up_gate_proj_scale,
const paddle::optional<paddle::Tensor>& down_proj_scale,
const paddle::optional<paddle::Tensor>& down_proj_in_scale,
const paddle::Tensor& ffn1_weight, const paddle::Tensor& ffn2_weight,
const paddle::optional<paddle::Tensor>& ffn1_bias,
const paddle::optional<paddle::Tensor>& ffn1_scale,
const paddle::optional<paddle::Tensor>& ffn2_scale,
const paddle::optional<paddle::Tensor>& ffn2_in_scale,
const paddle::optional<paddle::Tensor>& expert_idx_per_token,
const std::string& quant_method, const bool used_in_ep_low_latency);
paddle::Tensor MoeExpertFFNWint2Func(
const paddle::Tensor& permute_input,
const paddle::Tensor& tokens_expert_prefix_sum,
const paddle::Tensor& up_gate_proj_weight,
const paddle::Tensor& down_proj_weight,
const paddle::optional<paddle::Tensor>& up_gate_proj_bias,
const paddle::optional<paddle::Tensor>& up_gate_proj_scale,
const paddle::optional<paddle::Tensor>& down_proj_scale,
const paddle::optional<paddle::Tensor>& up_gate_proj_local_scale,
const paddle::optional<paddle::Tensor>& up_gate_proj_code_scale,
const paddle::optional<paddle::Tensor>& up_gate_proj_code_zp,
const paddle::optional<paddle::Tensor>& down_proj_local_scale,
const paddle::optional<paddle::Tensor>& down_proj_code_scale,
const paddle::optional<paddle::Tensor>& down_proj_code_zp,
const paddle::Tensor& ffn1_weight,
const paddle::Tensor& ffn2_weight,
const paddle::optional<paddle::Tensor>& ffn1_bias,
const paddle::optional<paddle::Tensor>& ffn1_scale,
const paddle::optional<paddle::Tensor>& ffn2_scale,
const paddle::optional<paddle::Tensor>& ffn1_local_scale,
const paddle::optional<paddle::Tensor>& ffn1_code_scale,
const paddle::optional<paddle::Tensor>& ffn1_code_zp,
const paddle::optional<paddle::Tensor>& ffn2_local_scale,
const paddle::optional<paddle::Tensor>& ffn2_code_scale,
const paddle::optional<paddle::Tensor>& ffn2_code_zp,
const bool used_in_ep_low_latency);
paddle::Tensor MoeExpertReduceFunc(
const paddle::Tensor &ffn_out, const paddle::Tensor &top_k_weight,
const paddle::Tensor &permute_indices_per_token,
const paddle::Tensor &top_k_indices,
const paddle::optional<paddle::Tensor> &down_proj_bias,
const paddle::optional<paddle::Tensor> &ffn2_bias,
const bool norm_topk_prob, const float routed_scaling_factor);
void InitKVSignalPerQuery(const paddle::Tensor &seq_lens_encoder_tensor,
@@ -234,15 +233,9 @@ paddle::Tensor InitSignalLayerwiseFunc(const paddle::Tensor &kv_signal_metadata,
std::vector<paddle::Tensor> GetBlockShapeAndSplitKVBlock(
const paddle::Tensor &seq_lens_encoder,
const paddle::Tensor &seq_lens_decoder,
const paddle::Tensor &seq_lens_this_time,
paddle::Tensor &decoder_batch_ids, // Inplace
paddle::Tensor &decoder_tile_ids_per_batch, // Inplace
paddle::Tensor &decoder_num_blocks_x_cpu, // Inplace, Pinned Memory
paddle::Tensor &max_len_tensor_cpu, // Inplace, Pinned Memory
const int encoder_block_shape_q,
const int decoder_block_shape_q,
const int group_size,
const int block_size,
const paddle::Tensor &seq_lens_this_time, const paddle::Tensor &cum_offsets,
const int encoder_block_shape_q, const int decoder_block_shape_q,
const int group_size, const int block_size,
const int decoder_step_token_num);
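One side of this hunk has the caller preallocate the decoder scheduling buffers and pass them in for in-place writes (the two *_cpu tensors living in pinned host memory), while the other drives the split from cum_offsets instead. A minimal sketch of the in-place calling pattern, assuming the op is reachable as fastdeploy_ops.get_block_shape_and_split_kv_block, that it takes these arguments positionally, and that all shapes and block-shape constants below are purely illustrative:

    import paddle
    import fastdeploy_ops as ops  # hypothetical import name, from PYBIND11_MODULE(fastdeploy_ops, m)

    bsz = 8
    seq_lens_encoder = paddle.zeros([bsz], dtype="int32")
    seq_lens_decoder = paddle.ones([bsz], dtype="int32")
    seq_lens_this_time = paddle.ones([bsz], dtype="int32")

    # Caller-owned outputs, written in place by the kernel; shapes here are guesses.
    decoder_batch_ids = paddle.zeros([bsz], dtype="int32")
    decoder_tile_ids_per_batch = paddle.zeros([bsz], dtype="int32")
    # The signature documents these two as pinned host memory; .cpu() stands in for that here.
    decoder_num_blocks_x_cpu = paddle.zeros([1], dtype="int32").cpu()
    max_len_tensor_cpu = paddle.zeros([8], dtype="int32").cpu()

    ops.get_block_shape_and_split_kv_block(
        seq_lens_encoder, seq_lens_decoder, seq_lens_this_time,
        decoder_batch_ids, decoder_tile_ids_per_batch,
        decoder_num_blocks_x_cpu, max_len_tensor_cpu,
        64, 16, 8, 64, 1)  # encoder/decoder block_shape_q, group_size, block_size, decoder_step_token_num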
std::vector<paddle::Tensor> GetPaddingOffset(const paddle::Tensor &input_ids,
@@ -272,12 +265,13 @@ void GetStopFlagsMulti(const paddle::Tensor &topk_ids,
const paddle::Tensor &seq_lens,
const paddle::Tensor &end_ids,
const paddle::Tensor &next_tokens,
const paddle::Tensor &pre_ids,
const paddle::Tensor &step_idx,
const paddle::Tensor &stop_seqs,
const paddle::Tensor &stop_seqs_len,
const bool beam_search);
void GetStopFlagsMultiSeqs(
const paddle::Tensor &topk_ids, const paddle::Tensor &pre_ids,
const paddle::Tensor &step_idx, const paddle::Tensor &stop_flags,
const paddle::Tensor &seq_lens, const paddle::Tensor &stop_seqs,
const paddle::Tensor &stop_seqs_len, const paddle::Tensor &end_ids);
void UpdateInputes(const paddle::Tensor &stop_flags,
const paddle::Tensor &not_need_stop, // only on cpu
@@ -289,32 +283,6 @@ void UpdateInputes(const paddle::Tensor &stop_flags,
const paddle::Tensor &next_tokens,
const paddle::Tensor &is_block_step);
void UpdateInputesV1(const paddle::Tensor &stop_flags,
const paddle::Tensor &not_need_stop, // only on cpu
const paddle::Tensor &seq_lens_this_time,
const paddle::Tensor &seq_lens_encoder,
const paddle::Tensor &seq_lens_decoder,
const paddle::Tensor &step_seq_lens_decoder,
const paddle::Tensor &prompt_lens,
const paddle::Tensor &topk_ids,
const paddle::Tensor &input_ids,
const paddle::Tensor &block_tables,
const paddle::Tensor &stop_nums,
const paddle::Tensor &next_tokens,
const paddle::Tensor &is_block_step,
const int block_size);
void RecoverDecodeTask(const paddle::Tensor &stop_flags,
const paddle::Tensor &seq_lens_this_time,
const paddle::Tensor &seq_lens_encoder,
const paddle::Tensor &seq_lens_decoder,
const paddle::Tensor &step_seq_lens_decoder,
const paddle::Tensor &block_tables,
const paddle::Tensor &is_block_step,
const int block_size);
paddle::Tensor
GroupSwigluWithMasked(const paddle::Tensor &fc1_out_tensor,
const paddle::Tensor &token_nums_per_expert);
@@ -361,8 +329,8 @@ std::vector<paddle::Tensor> DecodeMLAWriteCacheKernel(
const paddle::Tensor& kv_cache,
const paddle::Tensor& seq_lens,
const paddle::Tensor& seq_lens_encoder,
const paddle::Tensor& batch_id_per_token,
const paddle::Tensor& cu_seqlens_q,
const paddle::Tensor& padding_offsets,
const paddle::Tensor& cum_offsets,
const paddle::Tensor& block_tables,
const std::string& cache_quant_type_str,
const int max_seq_len,
@@ -374,8 +342,8 @@ std::vector<paddle::Tensor> DecodeMLAWriteCacheKernel(
const paddle::Tensor& kv_cache,
const paddle::Tensor& seq_lens,
const paddle::Tensor& seq_lens_decoder,
const paddle::Tensor& batch_id_per_token,
const paddle::Tensor& cu_seqlens_q,
const paddle::Tensor& padding_offsets,
const paddle::Tensor& cum_offsets,
const paddle::Tensor& block_tables,
const std::string& cache_quant_type_str,
const int max_seq_len);
@@ -400,7 +368,8 @@ std::vector<paddle::Tensor> MultiHeadLatentAttention(
const paddle::Tensor& seq_lens_decoder,
const paddle::Tensor& seq_lens_this_time,
const paddle::Tensor& cu_seqlens_q,
const paddle::Tensor& batch_id_per_token,
const paddle::Tensor& padding_offsets,
const paddle::Tensor& cum_offsets,
const paddle::Tensor& block_tables,
const paddle::Tensor& encoder_batch_ids,
const paddle::Tensor& encoder_tile_ids_per_batch,
@@ -499,268 +468,6 @@ std::vector<paddle::Tensor> NoauxTc(
int topk,
float routed_scaling_factor);
#ifdef ENABLE_FP8
paddle::Tensor cutlass_fp8_fp8_half_gemm_func(
const paddle::Tensor& x,
const paddle::Tensor& y,
const paddle::optional<paddle::Tensor>& bias,
bool trans_x,
bool trans_y,
float scale, // only support per-tensor quantization
std::string output_dtype,
std::string activation_type);
paddle::Tensor MoeFusedHadamardQuantFp8Func(
const paddle::Tensor &input,
const paddle::Tensor &scale,
const paddle::Tensor &topk_ids,
const int top_k,
const int intermediate_size,
const bool tiled);
paddle::Tensor FusedHadamardQuantFp8Func(
const paddle::Tensor &input,
const float scale);
#endif
int64_t init_custom_all_reduce(const std::vector<int64_t>& fake_ipc_ptrs,
paddle::Tensor& rank_data, int64_t rank, bool full_nvlink);
void all_reduce(int64_t _fa, paddle::Tensor& inp, paddle::Tensor& out,
int64_t reg_buffer, int64_t reg_buffer_sz_bytes);
void dispose(int64_t _fa);
int64_t meta_size();
void register_buffer(int64_t _fa, const std::vector<int64_t>& fake_ipc_ptrs);
std::tuple<std::vector<int64_t>, std::vector<int64_t>> get_graph_buffer_ipc_meta(int64_t _fa);
void register_graph_buffers(int64_t _fa,
const std::vector<std::vector<int64_t>>& handles,
const std::vector<std::vector<int64_t>>& offsets);
std::tuple<int64_t, paddle::Tensor> allocate_shared_buffer_and_handle(
int64_t size);
int64_t open_mem_handle(paddle::Tensor& mem_handle);
void free_shared_buffer(int64_t buffer);
// speculative decoding Kernel
std::vector<paddle::Tensor> SpeculateGetPaddingOffset(
const paddle::Tensor& input_ids,
const paddle::Tensor& draft_tokens,
const paddle::Tensor& cum_offsets,
const paddle::Tensor& token_num,
const paddle::Tensor& seq_len,
const paddle::Tensor& seq_lens_encoder);
std::vector<paddle::Tensor> SpeculateGetSeqLensOutput(
const paddle::Tensor& seq_lens_this_time,
const paddle::Tensor& seq_lens_encoder,
const paddle::Tensor& seq_lens_decoder);
std::vector<paddle::Tensor> SpeculateGetOutputPaddingOffset(
const paddle::Tensor& output_cum_offsets_tmp,
const paddle::Tensor& out_token_num,
const paddle::Tensor& seq_lens_output,
const int max_seq_len);
void SpecTokenPenaltyMultiScores(const paddle::Tensor &pre_ids,
const paddle::Tensor &logits,
const paddle::Tensor &penalty_scores,
const paddle::Tensor &frequency_scores,
const paddle::Tensor &presence_scores,
const paddle::Tensor &temperatures,
const paddle::Tensor &bad_tokens,
const paddle::Tensor &cur_len,
const paddle::Tensor &min_len,
const paddle::Tensor &eos_token_id,
const paddle::Tensor &seq_lens_this_time,
const paddle::Tensor &output_padding_offset,
const paddle::Tensor &output_cum_offsets,
const int max_seq_len);
void SpecGetStopFlagsMultiSeqs(const paddle::Tensor &accept_tokens,
const paddle::Tensor &accept_num,
const paddle::Tensor &pre_ids,
const paddle::Tensor &step_idx,
const paddle::Tensor &stop_flags,
const paddle::Tensor &seq_lens,
const paddle::Tensor &stop_seqs,
const paddle::Tensor &stop_seqs_len,
const paddle::Tensor &end_ids);
void SpeculateVerify(
const paddle::Tensor &accept_tokens, const paddle::Tensor &accept_num,
const paddle::Tensor &step_idx, const paddle::Tensor &stop_flags,
const paddle::Tensor &seq_lens_encoder,
const paddle::Tensor &seq_lens_decoder, const paddle::Tensor &draft_tokens,
const paddle::Tensor &seq_lens_this_time,
const paddle::Tensor &verify_tokens, const paddle::Tensor &verify_scores,
const paddle::Tensor &max_dec_len, const paddle::Tensor &end_tokens,
const paddle::Tensor &is_block_step,
const paddle::Tensor &output_cum_offsets,
const paddle::Tensor &actual_candidate_len,
const paddle::Tensor &actual_draft_token_nums, const paddle::Tensor &topp,
int max_seq_len, int verify_window, bool enable_topp, bool benchmark_mode);
void SpeculateUpdateV3(const paddle::Tensor &seq_lens_encoder,
const paddle::Tensor &seq_lens_decoder,
const paddle::Tensor &not_need_stop,
const paddle::Tensor &draft_tokens,
const paddle::Tensor &actual_draft_token_nums,
const paddle::Tensor &accept_tokens,
const paddle::Tensor &accept_num,
const paddle::Tensor &stop_flags,
const paddle::Tensor &seq_lens_this_time,
const paddle::Tensor &is_block_step,
const paddle::Tensor &stop_nums);
void SpeculateSetValueByFlagsAndIdx(const paddle::Tensor &pre_ids_all,
const paddle::Tensor &accept_tokens,
const paddle::Tensor &accept_num,
const paddle::Tensor &stop_flags,
const paddle::Tensor &seq_lens_this_time,
const paddle::Tensor &seq_lens_encoder,
const paddle::Tensor &seq_lens_decoder,
const paddle::Tensor &step_idx);
void SpeculateSaveWithOutputMsgStatic(const paddle::Tensor& accept_tokens,
const paddle::Tensor& accept_num,
const paddle::Tensor& not_need_stop,
int64_t rank_id,
bool save_each_rank);
void SpeculateClearAcceptNums(const paddle::Tensor& accept_num,
const paddle::Tensor& seq_lens_decoder);
void NgramMatch(const paddle::Tensor &input_ids,
const paddle::Tensor &input_ids_len,
const paddle::Tensor &pre_ids,
const paddle::Tensor &step_idx,
const paddle::Tensor &draft_token_num,
const paddle::Tensor &draft_tokens,
const paddle::Tensor &seq_lens_this_time,
const paddle::Tensor &seq_lens_encoder,
const paddle::Tensor &seq_lens_decoder,
const paddle::Tensor &max_dec_len,
const int max_ngram_size,
const int max_draft_tokens);
// MTP
void DraftModelPostprocess(const paddle::Tensor& base_model_draft_tokens,
const paddle::Tensor& base_model_seq_lens_this_time,
const paddle::Tensor& base_model_seq_lens_encoder,
const paddle::Tensor& base_model_stop_flags);
void DraftModelPreprocess(const paddle::Tensor& draft_tokens,
const paddle::Tensor& input_ids,
const paddle::Tensor& stop_flags,
const paddle::Tensor& seq_lens_this_time,
const paddle::Tensor& seq_lens_encoder,
const paddle::Tensor& seq_lens_decoder,
const paddle::Tensor& step_idx,
const paddle::Tensor& not_need_stop,
const paddle::Tensor& batch_drop,
const paddle::Tensor& accept_tokens,
const paddle::Tensor& accept_num,
const paddle::Tensor& base_model_seq_lens_encoder,
const paddle::Tensor& base_model_seq_lens_decoder,
const paddle::Tensor& base_model_step_idx,
const paddle::Tensor& base_model_stop_flags,
const paddle::Tensor& base_model_is_block_step,
const paddle::Tensor& base_model_draft_tokens,
const int max_draft_token,
const bool truncate_first_token,
const bool splitwise_prefill);
void DraftModelUpdate(const paddle::Tensor& inter_next_tokens,
const paddle::Tensor& draft_tokens,
const paddle::Tensor& pre_ids,
const paddle::Tensor& seq_lens_this_time,
const paddle::Tensor& seq_lens_encoder,
const paddle::Tensor& seq_lens_decoder,
const paddle::Tensor& step_idx,
const paddle::Tensor& output_cum_offsets,
const paddle::Tensor& stop_flags,
const paddle::Tensor& not_need_stop,
const paddle::Tensor& max_dec_len,
const paddle::Tensor& end_ids,
const paddle::Tensor& base_model_draft_tokens,
const int max_seq_len,
const int substep);
std::vector<paddle::Tensor> EagleGetHiddenStates(
const paddle::Tensor& input,
const paddle::Tensor& seq_lens_this_time,
const paddle::Tensor& seq_lens_encoder,
const paddle::Tensor& seq_lens_decoder,
const paddle::Tensor& stop_flags,
const paddle::Tensor& accept_nums,
const paddle::Tensor& base_model_seq_lens_this_time,
const paddle::Tensor& base_model_seq_lens_encoder,
const int actual_draft_token_num);
std::vector<paddle::Tensor> EagleGetSelfHiddenStates(
const paddle::Tensor& input,
const paddle::Tensor& last_seq_lens_this_time,
const paddle::Tensor& seq_lens_this_time,
const paddle::Tensor& step_idx);
void MTPStepPaddle(
const paddle::Tensor &base_model_stop_flags,
const paddle::Tensor &stop_flags,
const paddle::Tensor &batch_drop,
const paddle::Tensor &seq_lens_this_time,
const paddle::Tensor &seq_lens_encoder,
const paddle::Tensor &seq_lens_decoder,
const paddle::Tensor &block_tables, // [bsz, block_num_per_seq]
const paddle::Tensor &encoder_block_lens,
const paddle::Tensor &used_list_len,
const paddle::Tensor &free_list,
const paddle::Tensor &free_list_len,
const int block_size,
const int max_draft_tokens);
void SpeculateStepPaddle(
const paddle::Tensor &stop_flags,
const paddle::Tensor &seq_lens_this_time,
const paddle::Tensor &ori_seq_lens_encoder,
const paddle::Tensor &seq_lens_encoder,
const paddle::Tensor &seq_lens_decoder,
const paddle::Tensor &block_tables, // [bsz, block_num_per_seq]
const paddle::Tensor &encoder_block_lens,
const paddle::Tensor &is_block_step,
const paddle::Tensor &step_block_list,
const paddle::Tensor &step_lens,
const paddle::Tensor &recover_block_list,
const paddle::Tensor &recover_lens,
const paddle::Tensor &need_block_list,
const paddle::Tensor &need_block_len,
const paddle::Tensor &used_list_len,
const paddle::Tensor &free_list,
const paddle::Tensor &free_list_len,
const paddle::Tensor &input_ids,
const paddle::Tensor &pre_ids,
const paddle::Tensor &step_idx,
const paddle::Tensor &next_tokens,
const paddle::Tensor &first_token_ids,
const paddle::Tensor &accept_num,
const int block_size,
const int encoder_decoder_block_num,
const int max_draft_tokens);
PYBIND11_MODULE(fastdeploy_ops, m) {
m.def("get_expert_token_num", &GetExpertTokenNum, py::arg("topk_ids"),
@@ -770,7 +477,7 @@ PYBIND11_MODULE(fastdeploy_ops, m) {
* moe/fused_moe/moe_redundant_topk_select.cu
* moe_redundant_topk_select
*/
m.def("moe_redundant_topk_select", &MoERedundantTopKSelectKernel,
m.def("f_moe_redundant_topk_select", &MoERedundantTopKSelectKernel,
py::arg("gating_logits"), py::arg("expert_id_to_ep_rank_array"),
py::arg("expert_in_rank_num_list"),
py::arg("tokens_per_expert_stats_list"), py::arg("bias"),
@@ -852,7 +559,7 @@ PYBIND11_MODULE(fastdeploy_ops, m) {
* ep_moe_dispatch
*/
m.def("ep_moe_expert_dispatch", &EPMoeExpertDispatch, py::arg("input"),
py::arg("topk_ids"), py::arg("topk_weights"), py::arg("up_gate_proj_in_scale"),
py::arg("topk_ids"), py::arg("topk_weights"), py::arg("ffn1_in_scale"),
py::arg("token_nums_per_expert"), py::arg("token_nums_this_rank"),
py::arg("moe_quant_type"), "ep moe export dispatch function");
@@ -860,7 +567,7 @@ PYBIND11_MODULE(fastdeploy_ops, m) {
m.def("ep_moe_expert_combine", &EPMoeExpertCombine, py::arg("ffn_out"),
py::arg("expert_scales_float"), py::arg("permute_indices_per_token"),
py::arg("top_k_indices"), py::arg("down_proj_bias"),
py::arg("top_k_indices"), py::arg("ffn2_bias"),
py::arg("norm_topk_prob"), py::arg("routed_scaling_factor"),
"ep moe export combine function");
@@ -902,7 +609,7 @@ PYBIND11_MODULE(fastdeploy_ops, m) {
*/
m.def("moe_expert_reduce", &MoeExpertReduceFunc, py::arg("ffn_out"),
py::arg("top_k_weight"), py::arg("permute_indices_per_token"),
py::arg("top_k_indices"), py::arg("down_proj_bias"),
py::arg("top_k_indices"), py::arg("ffn2_bias"),
py::arg("norm_topk_prob"), py::arg("routed_scaling_factor"),
"moe export reduce function");
@@ -930,8 +637,9 @@ PYBIND11_MODULE(fastdeploy_ops, m) {
* append_attn/get_block_shape_and_split_kv_block.cu
* get_block_shape_and_split_kv_block
*/
m.def("get_block_shape_and_split_kv_block",
&GetBlockShapeAndSplitKVBlock, "get_block_shape_and_split_kv_block function");
// m.def("f_get_block_shape_and_split_kv_block",
// &GetBlockShapeAndSplitKVBlock, "get_block_shape_and_split_kv_block
// function");
/**
* get_padding_offset.cu
@@ -959,6 +667,12 @@ PYBIND11_MODULE(fastdeploy_ops, m) {
m.def("set_stop_value_multi_ends", &GetStopFlagsMulti,
"update_inputs function");
/**
* stop_generation_multi_stop_seqs.cu
* set_stop_value_multi_seqs
*/
m.def("set_stop_value_multi_seqs", &GetStopFlagsMultiSeqs,
"update_inputs function");
/**
* update_inputs.cu
@@ -966,18 +680,6 @@ PYBIND11_MODULE(fastdeploy_ops, m) {
*/
m.def("update_inputs", &UpdateInputes, "update_inputs function");
/**
* update_inputs_v1.cu
* update_inputs_v1
*/
m.def("update_inputs_v1", &UpdateInputesV1, "update inputs for scheduler v1 function");
/**
* recover_decode_task.cu
* recover_decode_task
*/
m.def("recover_decode_task", &RecoverDecodeTask, "recover decode task for scheduler v1 function");
/**
* extract_text_token_output.cu
* extract_text_token_output
@@ -998,17 +700,35 @@ PYBIND11_MODULE(fastdeploy_ops, m) {
m.def("tritonmoe_preprocess_func", &tritonmoe_preprocess_kernel);
m.def("MoeWna16MarlinGemmApi", &MoeWna16MarlinGemmApi,
py::arg("a"), py::arg("c_or_none"), py::arg("b_q_weight"),
py::arg("b_scales"), py::arg("global_scale_or_none"), py::arg("b_zeros_or_none"),
py::arg("g_idx_or_none"), py::arg("perm_or_none"), py::arg("workspace"), py::arg("sorted_token_ids"),
py::arg("expert_ids"), py::arg("num_tokens_post_padded"), py::arg("topk_weights"), py::arg("moe_block_size"),
py::arg("top_k"), py::arg("mul_topk_weights"), py::arg("is_ep"), py::arg("b_q_type_str"),
py::arg("size_m"), py::arg("size_n"), py::arg("size_k"), py::arg("is_k_full"), py::arg("use_atomic_add"),
py::arg("use_fp32_reduce"), py::arg("is_zp_float"));
py::arg("a"),
py::arg("c_or_none"),
py::arg("b_q_weight"),
py::arg("b_scales"),
py::arg("global_scale_or_none"),
py::arg("b_zeros_or_none"),
py::arg("g_idx_or_none"),
py::arg("perm_or_none"),
py::arg("workspace"),
py::arg("sorted_token_ids"),
py::arg("expert_ids"),
py::arg("num_tokens_post_padded"),
py::arg("topk_weights"),
py::arg("moe_block_size"),
py::arg("top_k"),
py::arg("mul_topk_weights"),
py::arg("is_ep"),
py::arg("b_q_type_str"),
py::arg("size_m"),
py::arg("size_n"),
py::arg("size_k"),
py::arg("is_k_full"),
py::arg("use_atomic_add"),
py::arg("use_fp32_reduce"),
py::arg("is_zp_float"));
m.def("get_position_ids_and_mask_encoder_batch", &GetPositionIdsAndMaskEncoderBatch,
"get_position_ids_and_mask_encoder_batch function");
/**
* cutlass_scaled_mm.cu
* cutlass_scaled_mm
@@ -1042,73 +762,4 @@ PYBIND11_MODULE(fastdeploy_ops, m) {
m.def("multi_head_latent_attention", &MultiHeadLatentAttention, "multi_head_latent_attention function");
m.def("noaux_tc",&NoauxTc, "noaux_tc for Deepseekv3 MoE compute");
#ifdef ENABLE_FP8
m.def("cutlass_fp8_fp8_half_gemm_fused", &cutlass_fp8_fp8_half_gemm_func,
py::arg("x"), py::arg("y"), py::arg("bias"), py::arg("transpose_x"),
py::arg("transpose_y"), py::arg("scale"), py::arg("output_dtype"),
py::arg("activation_type"), "cutlass_fp8_fp8_half_gemm_fused function");
m.def("moe_fused_hadamard_quant_fp8", &MoeFusedHadamardQuantFp8Func,
py::arg("input"), py::arg("scale"), py::arg("topk_ids"),
py::arg("top_k"), py::arg("intermediate_size"), py::arg("tiled"), "moe_fused_hadamard_quant_fp8 function");
m.def("fused_hadamard_quant_fp8", &FusedHadamardQuantFp8Func,
py::arg("input"), py::arg("scale"), "fused_hadamard_quant_fp8 function");
#endif
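For the FP8-only bindings above, a minimal sketch of the fused Hadamard quant call, assuming a build with ENABLE_FP8, that the extension is importable as fastdeploy_ops, and that the shape and dtype below are illustrative (the kernel's real dtype requirements are not spelled out in this header):

    import paddle
    import fastdeploy_ops as ops  # hypothetical import name

    x = paddle.randn([16, 4096], dtype="float32").cast("bfloat16")  # illustrative activation tensor
    out = ops.fused_hadamard_quant_fp8(input=x, scale=1.0)          # keyword names follow the py::arg list above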
m.def("init_custom_all_reduce", &init_custom_all_reduce, "init all reduce class function");
m.def("all_reduce", &all_reduce, "all reduce function");
m.def("dispose", &dispose, "del function for python");
m.def("meta_size", &meta_size, "meta_size function for Signal struct");
m.def("register_buffer", &register_buffer, "register ipc buffer");
m.def("register_graph_buffers", &register_graph_buffers, "register_graph_buffers");
m.def("allocate_shared_buffer_and_handle", &allocate_shared_buffer_and_handle, "allocate_shared_buffer_and_handle");
m.def("free_shared_buffer", &free_shared_buffer, "free_shared_buffer");
m.def("open_mem_handle", &open_mem_handle, "open_mem_handle");
m.def("get_graph_buffer_ipc_meta", &get_graph_buffer_ipc_meta, "get_graph_buffer_ipc_meta");
// speculative decoding Kernel
m.def("speculate_get_padding_offset", &SpeculateGetPaddingOffset, "speculate_get_padding_offset function");
m.def("speculate_get_seq_lens_output", &SpeculateGetSeqLensOutput, "speculate_get_seq_lens_output function");
m.def("speculate_get_output_padding_offset",&SpeculateGetOutputPaddingOffset, "speculate_get_output_padding_offset function");
m.def("speculate_get_token_penalty_multi_scores",&SpecTokenPenaltyMultiScores, "speculate_get_token_penalty_multi_scores function");
m.def("speculate_set_stop_value_multi_seqs",&SpecGetStopFlagsMultiSeqs, "speculate_set_stop_value_multi_seqs function");
m.def("speculate_verify",&SpeculateVerify, "speculate_verify function");
m.def("speculate_update_v3",&SpeculateUpdateV3, "noaux_tc for Deepseekv3 MoE compute function");
m.def("speculate_set_value_by_flags_and_idx",&SpeculateSetValueByFlagsAndIdx, "speculate_set_value_by_flags_and_idx function");
m.def("speculate_save_output", &SpeculateSaveWithOutputMsgStatic, "speculate_save_output function");
m.def("speculate_clear_accept_nums",&SpeculateClearAcceptNums, "speculate_clear_accept_nums function");
m.def("ngram_match", &NgramMatch, "ngram_match function");
m.def("draft_model_postprocess",&DraftModelPostprocess, "draft_model_postprocess function");
m.def("draft_model_preprocess",&DraftModelPreprocess, "draft_model_preprocess function");
m.def("draft_model_update",&DraftModelUpdate, "draft_model_update function");
m.def("eagle_get_hidden_states",&EagleGetHiddenStates, "eagle_get_hidden_states function");
m.def("eagle_get_self_hidden_states", &EagleGetSelfHiddenStates, "eagle_get_self_hidden_states function");
m.def("mtp_step_paddle",&MTPStepPaddle, "mtp_step_paddle function");
m.def("speculate_step_paddle",&SpeculateStepPaddle, "speculate_step_paddle function");
}

Some files were not shown because too many files have changed in this diff.