Mirror of https://github.com/PaddlePaddle/FastDeploy.git (synced 2025-10-03 15:56:49 +08:00)

Compare commits: copilot/ad ... v2.1.1 (85 commits)
SHA1
c49c43d51c
a424ab907f
10a95f8ed5
b9af800edd
64cf769bee
3364af767b
578b8c5da2
8517e04956
aad9d3564e
6039cdc2c5
6545994c58
6a90cfd144
47e6270dec
80db7fce05
96aed92e4a
d8444e22ca
df27a488b1
b1f8f1aa07
4e369c7fa7
f8d3255520
e8af92aab7
8b9f167ccc
93d999b830
4d6fb96cd6
c18975366e
4a9c04a746
d97aab25bc
1b399b91c0
8bf48dfab8
fcdc5c2c54
5d4d38674f
d07338f932
3ffbc98179
edd13aad66
1065406ed3
570ad54b51
9af57513b3
2e6d97f5eb
ff030d9090
5a829fc7af
d998efbc17
8a15bdc0c8
ad8ea68906
101605869c
28918702c2
02596fc537
03347626a6
b2df0311b8
d1d321bafd
dc5d3ff5a0
f0a707e06f
4870919682
a375378cc1
192f9caab4
81092c0fe3
ad816f20f4
37b76158f9
fe2094609f
b4bb54b56b
eeec4bd15e
d2592750f7
25f51b0611
9b07f85f6d
2fe31c6f0f
a33e557732
054c790642
ca4e4ab911
c000cff744
86ff68be4b
702c313ed1
6706ccb37e
1b6f482c15
5d3bf308f6
f672a34f95
bc0b92bba4
3dd8492601
bd77a3a643
9561603ed9
e26313a355
4367c09a5f
8e789dcb67
5f6fc7f7b9
d4059cabf0
c8dd5976ae
4880c16be3
.github/workflows/_build_linux.yml (vendored): 6 changed lines
@@ -124,14 +124,12 @@ jobs:
echo "Date Only: $DATE_ONLY"
export FASTDEPLOY_VERSION="${FASTDEPLOY_VERSION}.dev${DATE_ONLY}"
fi
pip config set global.index-url http://pip.baidu.com/root/baidu/+simple/
pip config set install.trusted-host pip.baidu.com
pip config set global.extra-index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
python -m pip install paddlepaddle-gpu==3.1.1 -i https://www.paddlepaddle.org.cn/packages/stable/cu126/
pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple

python -m pip install --upgrade pip
python -m pip install -r requirements.txt
python -m pip install wheel
python -m pip install --pre paddlepaddle-gpu -i https://www.paddlepaddle.org.cn/packages/nightly/cu126/
# 编译RDMA
export ENABLE_FD_RDMA=1
bash build.sh 1 python false [${COMPILE_ARCH}]
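The build step above can also be reproduced outside CI; a minimal sketch, assuming a CUDA 12.6 environment and the SM architectures used elsewhere in this PR ("89,90"):

```bash
# Hypothetical local reproduction of the CI build step; the COMPILE_ARCH value is an assumption.
export COMPILE_ARCH="89,90"
export ENABLE_FD_RDMA=1      # build the RDMA components, as the workflow does
python -m pip install --upgrade pip wheel
python -m pip install -r requirements.txt
# one of the PaddlePaddle install channels shown in the workflow above
python -m pip install --pre paddlepaddle-gpu -i https://www.paddlepaddle.org.cn/packages/nightly/cu126/
bash build.sh 1 python false [${COMPILE_ARCH}]
```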
.github/workflows/_logprob_test_linux.yml (vendored): 65 changed lines
@@ -62,18 +62,22 @@ jobs:
MODEL_CACHE_DIR: ${{ inputs.MODEL_CACHE_DIR }}
run: |
runner_name="${{ runner.name }}"
last_char="${runner_name: -1}"
CARD_ID=$(echo "${runner_name}" | awk -F'-' '{print $NF}')
DEVICES=$(echo "$CARD_ID" | fold -w1 | paste -sd,)
DEVICE_PORT=$(echo "$DEVICES" | cut -d',' -f1)

if [[ "$last_char" =~ [0-7] ]]; then
DEVICES="$last_char"
else
DEVICES="0"
fi

FLASK_PORT=$((9160 + DEVICES * 100))
FD_API_PORT=$((9180 + DEVICES * 100))
FD_ENGINE_QUEUE_PORT=$((9150 + DEVICES * 100))
FD_METRICS_PORT=$((9170 + DEVICES * 100))
FLASK_PORT=$((42068 + DEVICE_PORT * 100))
FD_API_PORT=$((42088 + DEVICE_PORT * 100))
FD_ENGINE_QUEUE_PORT=$((42058 + DEVICE_PORT * 100))
FD_METRICS_PORT=$((42078 + DEVICE_PORT * 100))
echo "Test ENV Parameter:"
echo "========================================================="
echo "FLASK_PORT=${FLASK_PORT}"
echo "FD_API_PORT=${FD_API_PORT}"
echo "FD_ENGINE_QUEUE_PORT=${FD_ENGINE_QUEUE_PORT}"
echo "FD_METRICS_PORT=${FD_METRICS_PORT}"
echo "DEVICES=${DEVICES}"
echo "========================================================="

CACHE_DIR="${CACHE_DIR:-$(dirname "$(dirname "${{ github.workspace }}")")}"
echo "CACHE_DIR is set to ${CACHE_DIR}"
@@ -85,9 +89,34 @@ jobs:
exit 1
fi

PARENT_DIR=$(dirname "$WORKSPACE")
PORTS=($FLASK_PORT $FD_API_PORT $FD_ENGINE_QUEUE_PORT $FD_METRICS_PORT)
LOG_FILE="./port_cleanup_$(date +%Y%m%d_%H%M%S).log"
echo "==== LOG_FILE is ${LOG_FILE} ===="

docker run --ipc=host --pid=host --net=host \
echo "==== PORT CLEAN BEFORE TASK RUN ====" | tee -a $LOG_FILE

for port in "${PORTS[@]}"; do
PIDS=$(lsof -t -i :$port || true)
if [ -n "$PIDS" ]; then
echo "Port $port is occupied by PID(s): $PIDS" | tee -a $LOG_FILE
echo "$PIDS" | xargs -r kill -9
echo "Port $port cleared" | tee -a $LOG_FILE
else
echo "Port $port is free" | tee -a $LOG_FILE
fi
done

echo "==== PORT CLEAN COMPLETE ====" | tee -a $LOG_FILE

echo "========================================================="
echo "Ensuring no stale container named ${runner_name} ..."
if [ "$(docker ps -a -q -f name=${runner_name})" ]; then
echo "Removing stale container: ${runner_name}"
docker rm -f ${runner_name} || true
fi

docker run --rm --ipc=host --pid=host --net=host \
--name ${runner_name} \
-v $(pwd):/workspace \
-w /workspace \
-e fastdeploy_wheel_url=${fastdeploy_wheel_url} \
@@ -100,13 +129,11 @@ jobs:
-v "${CACHE_DIR}/.cache:/root/.cache" \
-v "${CACHE_DIR}/ConfigDir:/root/.config" \
-e TZ="Asia/Shanghai" \
--gpus '"device='"${DEVICES}"'"' ${docker_image} /bin/bash -c '
# python -m pip install --pre paddlepaddle-gpu -i https://www.paddlepaddle.org.cn/packages/nightly/cu126/
python -m pip install paddlepaddle-gpu==3.0.0.dev20250729 -i https://www.paddlepaddle.org.cn/packages/nightly/cu126/
--gpus '"device='"${DEVICES}"'"' ${docker_image} /bin/bash -xc '
python -m pip install paddlepaddle-gpu==3.1.1 -i https://www.paddlepaddle.org.cn/packages/stable/cu126/

pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple

pip config set global.index-url http://pip.baidu.com/root/baidu/+simple/
pip config set install.trusted-host pip.baidu.com
pip config set global.extra-index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
python -m pip install ${fastdeploy_wheel_url}

wget https://paddle-qa.bj.bcebos.com/zhengtianyu/tools/llm-deploy-linux-amd64
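The port selection in this job keys everything off the numeric card ID at the end of the runner name; a minimal standalone sketch of that derivation, using a hypothetical runner name:

```bash
# Hypothetical runner name; real runners are named by the self-hosted pool.
runner_name="fastdeploy-runner-3"
CARD_ID=$(echo "${runner_name}" | awk -F'-' '{print $NF}')   # "3"
DEVICES=$(echo "$CARD_ID" | fold -w1 | paste -sd,)           # "3" (a multi-digit ID such as "23" becomes "2,3")
DEVICE_PORT=$(echo "$DEVICES" | cut -d',' -f1)               # first device index selects the port block
FLASK_PORT=$((42068 + DEVICE_PORT * 100))                    # 42368
FD_API_PORT=$((42088 + DEVICE_PORT * 100))                   # 42388
FD_ENGINE_QUEUE_PORT=$((42058 + DEVICE_PORT * 100))          # 42358
FD_METRICS_PORT=$((42078 + DEVICE_PORT * 100))               # 42378
```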
.github/workflows/_pre_ce_test.yml (vendored, Normal file): 148 changed lines
@@ -0,0 +1,148 @@
|
||||
name: Pre-CE-Test
|
||||
|
||||
on:
|
||||
workflow_call:
|
||||
inputs:
|
||||
DOCKER_IMAGE:
|
||||
description: "Build Images"
|
||||
required: true
|
||||
type: string
|
||||
default: "ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/paddle:fastdeploy-ciuse-cuda126"
|
||||
FASTDEPLOY_ARCHIVE_URL:
|
||||
description: "URL of the compressed FastDeploy code archive."
|
||||
required: true
|
||||
type: string
|
||||
FASTDEPLOY_WHEEL_URL:
|
||||
description: "URL of the FastDeploy Wheel."
|
||||
required: true
|
||||
type: string
|
||||
CACHE_DIR:
|
||||
description: "Cache Dir Use"
|
||||
required: false
|
||||
type: string
|
||||
default: ""
|
||||
MODEL_CACHE_DIR:
|
||||
description: "Cache Dir Use"
|
||||
required: false
|
||||
type: string
|
||||
default: ""
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.event.pull_request.number }}
|
||||
cancel-in-progress: true
|
||||
|
||||
jobs:
|
||||
run_ce_cases:
|
||||
runs-on: [self-hosted, PRE_CE_RUN_2Card]
|
||||
steps:
|
||||
- name: Print current runner name
|
||||
run: |
|
||||
echo "Current runner name: ${{ runner.name }}"
|
||||
- name: Code Prepare
|
||||
shell: bash
|
||||
env:
|
||||
docker_image: ${{ inputs.DOCKER_IMAGE }}
|
||||
fd_archive_url: ${{ inputs.FASTDEPLOY_ARCHIVE_URL }}
|
||||
run: |
|
||||
set -x
|
||||
REPO="https://github.com/${{ github.repository }}.git"
|
||||
FULL_REPO="${{ github.repository }}"
|
||||
REPO_NAME="${FULL_REPO##*/}"
|
||||
BASE_BRANCH="${{ github.base_ref }}"
|
||||
|
||||
# Clean the repository directory before starting
|
||||
docker run --rm --net=host -v $(pwd):/workspace -w /workspace \
|
||||
-e "REPO_NAME=${REPO_NAME}" \
|
||||
${docker_image} /bin/bash -c '
|
||||
if [ -d ${REPO_NAME} ]; then
|
||||
echo "Directory ${REPO_NAME} exists, removing it..."
|
||||
rm -rf ${REPO_NAME}*
|
||||
fi
|
||||
'
|
||||
|
||||
wget -q ${fd_archive_url}
|
||||
tar -xf FastDeploy.tar.gz
|
||||
rm -rf FastDeploy.tar.gz
|
||||
cd FastDeploy
|
||||
git config --global user.name "FastDeployCI"
|
||||
git config --global user.email "fastdeploy_ci@example.com"
|
||||
git log -n 3 --oneline
|
||||
|
||||
- name: Run CI unittest
|
||||
env:
|
||||
docker_image: ${{ inputs.DOCKER_IMAGE }}
|
||||
fd_wheel_url: ${{ inputs.FASTDEPLOY_WHEEL_URL }}
|
||||
CACHE_DIR: ${{ inputs.CACHE_DIR }}
|
||||
MODEL_CACHE_DIR: ${{ inputs.MODEL_CACHE_DIR }}
|
||||
run: |
|
||||
runner_name="${{ runner.name }}"
|
||||
CARD_ID=$(echo "${runner_name}" | awk -F'-' '{print $NF}')
|
||||
DEVICES=$(echo "$CARD_ID" | fold -w1 | paste -sd,)
|
||||
DEVICE_PORT=$(echo "$DEVICES" | cut -d',' -f1)
|
||||
|
||||
FLASK_PORT=$((42068 + DEVICE_PORT * 100))
|
||||
FD_API_PORT=$((42088 + DEVICE_PORT * 100))
|
||||
FD_ENGINE_QUEUE_PORT=$((42058 + DEVICE_PORT * 100))
|
||||
FD_METRICS_PORT=$((42078 + DEVICE_PORT * 100))
|
||||
echo "Test ENV Parameter:"
|
||||
echo "========================================================="
|
||||
echo "FLASK_PORT=${FLASK_PORT}"
|
||||
echo "FD_API_PORT=${FD_API_PORT}"
|
||||
echo "FD_ENGINE_QUEUE_PORT=${FD_ENGINE_QUEUE_PORT}"
|
||||
echo "FD_METRICS_PORT=${FD_METRICS_PORT}"
|
||||
echo "DEVICES=${DEVICES}"
|
||||
echo "========================================================="
|
||||
|
||||
CACHE_DIR="${CACHE_DIR:-$(dirname "$(dirname "${{ github.workspace }}")")}"
|
||||
echo "CACHE_DIR is set to ${CACHE_DIR}"
|
||||
if [ ! -f "${CACHE_DIR}/gitconfig" ]; then
|
||||
touch "${CACHE_DIR}/gitconfig"
|
||||
fi
|
||||
|
||||
PORTS=($FLASK_PORT $FD_API_PORT $FD_ENGINE_QUEUE_PORT $FD_METRICS_PORT)
|
||||
LOG_FILE="./port_cleanup_$(date +%Y%m%d_%H%M%S).log"
|
||||
echo "==== LOG_FILE is ${LOG_FILE} ===="
|
||||
|
||||
echo "==== PORT CLEAN BEFORE TASK RUN ====" | tee -a $LOG_FILE
|
||||
|
||||
for port in "${PORTS[@]}"; do
|
||||
PIDS=$(lsof -t -i :$port || true)
|
||||
if [ -n "$PIDS" ]; then
|
||||
echo "Port $port is occupied by PID(s): $PIDS" | tee -a $LOG_FILE
|
||||
echo "$PIDS" | xargs -r kill -9
|
||||
echo "Port $port cleared" | tee -a $LOG_FILE
|
||||
else
|
||||
echo "Port $port is free" | tee -a $LOG_FILE
|
||||
fi
|
||||
done
|
||||
|
||||
echo "==== PORT CLEAN COMPLETE ====" | tee -a $LOG_FILE
|
||||
|
||||
echo "========================================================="
|
||||
echo "Ensuring no stale container named ${runner_name} ..."
|
||||
if [ "$(docker ps -a -q -f name=${runner_name})" ]; then
|
||||
echo "Removing stale container: ${runner_name}"
|
||||
docker rm -f ${runner_name} || true
|
||||
fi
|
||||
|
||||
docker run --rm --net=host \
|
||||
--name ${runner_name} \
|
||||
-v $(pwd):/workspace \
|
||||
-w /workspace \
|
||||
-v "${CACHE_DIR}/gitconfig:/etc/gitconfig:ro" \
|
||||
-v "${CACHE_DIR}/.cache:/root/.cache" \
|
||||
-v "${CACHE_DIR}/ConfigDir:/root/.config" \
|
||||
-v "${MODEL_CACHE_DIR}:/ModelData:ro" \
|
||||
-e "MODEL_PATH=/ModelData" \
|
||||
-e "FD_API_PORT=${FD_API_PORT}" \
|
||||
-e "FD_ENGINE_QUEUE_PORT=${FD_ENGINE_QUEUE_PORT}" \
|
||||
-e "FD_METRICS_PORT=${FD_METRICS_PORT}" \
|
||||
-e "FLASK_PORT=${FLASK_PORT}" \
|
||||
-e "fd_wheel_url=${fd_wheel_url}" \
|
||||
--gpus "\"device=${DEVICES}\"" ${docker_image} /bin/bash -c '
|
||||
git config --global --add safe.directory /workspace/FastDeploy
|
||||
cd FastDeploy
|
||||
python -m pip install paddlepaddle-gpu==3.1.1 -i https://www.paddlepaddle.org.cn/packages/stable/cu126/
|
||||
python -m pip install ${fd_wheel_url}
|
||||
bash scripts/run_pre_ce.sh
|
||||
'
|
.github/workflows/_unit_test_coverage.yml (vendored): 215 changed lines
@@ -22,13 +22,19 @@ on:
|
||||
required: false
|
||||
type: string
|
||||
default: ""
|
||||
MODEL_CACHE_DIR:
|
||||
description: "Cache Dir Use"
|
||||
required: false
|
||||
type: string
|
||||
default: ""
|
||||
|
||||
jobs:
|
||||
run_tests_with_coverage:
|
||||
runs-on: [self-hosted, GPU-h1z1-4Cards]
|
||||
runs-on: [self-hosted, GPU-h1z1-2Cards]
|
||||
outputs:
|
||||
diff_cov_file_url: ${{ steps.cov_upload.outputs.diff_cov_file_url }}
|
||||
unittest_failed_url: ${{ steps.unittest_failed.outputs.unittest_failed_url }}
|
||||
unittest_failed_url: ${{ steps.cov_upload.outputs.unittest_failed_url }}
|
||||
diff_cov_result_json_url: ${{ steps.cov_upload.outputs.diff_cov_result_json_url }}
|
||||
steps:
|
||||
- name: Code Prepare
|
||||
shell: bash
|
||||
@@ -66,58 +72,110 @@ jobs:
|
||||
fd_wheel_url: ${{ inputs.FASTDEPLOY_WHEEL_URL }}
|
||||
CACHE_DIR: ${{ inputs.CACHE_DIR }}
|
||||
BASE_REF: ${{ github.event.pull_request.base.ref }}
|
||||
MODEL_CACHE_DIR: ${{ inputs.MODEL_CACHE_DIR }}
|
||||
run: |
|
||||
set -x
|
||||
runner_name="${{ runner.name }}"
|
||||
CARD_ID=$(echo "${runner_name}" | awk -F'-' '{print $NF}')
|
||||
gpu_id=$(echo "$CARD_ID" | fold -w1 | paste -sd,)
|
||||
set -x
|
||||
runner_name="${{ runner.name }}"
|
||||
CARD_ID=$(echo "${runner_name}" | awk -F'-' '{print $NF}')
|
||||
DEVICES=$(echo "$CARD_ID" | fold -w1 | paste -sd,)
|
||||
DEVICE_PORT=$(echo "$DEVICES" | cut -d',' -f1)
|
||||
|
||||
CACHE_DIR="${CACHE_DIR:-$(dirname "$(dirname "${{ github.workspace }}")")}"
|
||||
echo "CACHE_DIR is set to ${CACHE_DIR}"
|
||||
if [ ! -f "${CACHE_DIR}/gitconfig" ]; then
|
||||
touch "${CACHE_DIR}/gitconfig"
|
||||
fi
|
||||
PARENT_DIR=$(dirname "$WORKSPACE")
|
||||
echo "PARENT_DIR:$PARENT_DIR"
|
||||
docker run --rm --net=host \
|
||||
--cap-add=SYS_PTRACE --privileged --shm-size=64G \
|
||||
-v $(pwd):/workspace -w /workspace \
|
||||
-v "${CACHE_DIR}/gitconfig:/etc/gitconfig:ro" \
|
||||
-v "${CACHE_DIR}/.cache:/root/.cache" \
|
||||
-v "${CACHE_DIR}/ConfigDir:/root/.config" \
|
||||
-e TZ="Asia/Shanghai" \
|
||||
-e "fd_wheel_url=${fd_wheel_url}" \
|
||||
-e "BASE_REF=${BASE_REF}" \
|
||||
--gpus "\"device=${gpu_id}\"" ${docker_image} /bin/bash -c '
|
||||
FLASK_PORT=$((42068 + DEVICE_PORT * 100))
|
||||
FD_API_PORT=$((42088 + DEVICE_PORT * 100))
|
||||
FD_ENGINE_QUEUE_PORT=$((42058 + DEVICE_PORT * 100))
|
||||
FD_METRICS_PORT=$((42078 + DEVICE_PORT * 100))
|
||||
echo "Test ENV Parameter:"
|
||||
echo "========================================================="
|
||||
echo "FLASK_PORT=${FLASK_PORT}"
|
||||
echo "FD_API_PORT=${FD_API_PORT}"
|
||||
echo "FD_ENGINE_QUEUE_PORT=${FD_ENGINE_QUEUE_PORT}"
|
||||
echo "FD_METRICS_PORT=${FD_METRICS_PORT}"
|
||||
echo "DEVICES=${DEVICES}"
|
||||
echo "========================================================="
|
||||
|
||||
git config --global --add safe.directory /workspace/FastDeploy
|
||||
cd FastDeploy
|
||||
# python -m pip install --pre paddlepaddle-gpu -i https://www.paddlepaddle.org.cn/packages/nightly/cu126/
|
||||
python -m pip install paddlepaddle-gpu==3.0.0.dev20250729 -i https://www.paddlepaddle.org.cn/packages/nightly/cu126/
|
||||
CACHE_DIR="${CACHE_DIR:-$(dirname "$(dirname "${{ github.workspace }}")")}"
|
||||
echo "CACHE_DIR is set to ${CACHE_DIR}"
|
||||
if [ ! -f "${CACHE_DIR}/gitconfig" ]; then
|
||||
touch "${CACHE_DIR}/gitconfig"
|
||||
fi
|
||||
|
||||
pip config set global.index-url http://pip.baidu.com/root/baidu/+simple/
|
||||
pip config set install.trusted-host pip.baidu.com
|
||||
pip config set global.extra-index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
|
||||
PORTS=($FLASK_PORT $FD_API_PORT $FD_ENGINE_QUEUE_PORT $FD_METRICS_PORT)
|
||||
LOG_FILE="./port_cleanup_$(date +%Y%m%d_%H%M%S).log"
|
||||
echo "==== LOG_FILE is ${LOG_FILE} ===="
|
||||
|
||||
echo "==== PORT CLEAN BEFORE TASK RUN ====" | tee -a $LOG_FILE
|
||||
|
||||
for port in "${PORTS[@]}"; do
|
||||
PIDS=$(lsof -t -i :$port || true)
|
||||
if [ -n "$PIDS" ]; then
|
||||
echo "Port $port is occupied by PID(s): $PIDS" | tee -a $LOG_FILE
|
||||
echo "$PIDS" | xargs -r kill -9
|
||||
echo "Port $port cleared" | tee -a $LOG_FILE
|
||||
else
|
||||
echo "Port $port is free" | tee -a $LOG_FILE
|
||||
fi
|
||||
done
|
||||
|
||||
echo "==== PORT CLEAN COMPLETE ====" | tee -a $LOG_FILE
|
||||
|
||||
echo "========================================================="
|
||||
echo "Ensuring no stale container named ${runner_name} ..."
|
||||
if [ "$(docker ps -a -q -f name=${runner_name})" ]; then
|
||||
echo "Removing stale container: ${runner_name}"
|
||||
docker rm -f ${runner_name} || true
|
||||
fi
|
||||
|
||||
docker run --rm --net=host \
|
||||
--name ${runner_name} \
|
||||
--cap-add=SYS_PTRACE --shm-size=64G \
|
||||
-v $(pwd):/workspace -w /workspace \
|
||||
-v "${CACHE_DIR}/gitconfig:/etc/gitconfig:ro" \
|
||||
-v "${CACHE_DIR}/.cache:/root/.cache" \
|
||||
-v "${CACHE_DIR}/ConfigDir:/root/.config" \
|
||||
-v "${MODEL_CACHE_DIR}:/ModelData:ro" \
|
||||
-e "MODEL_PATH=/ModelData" \
|
||||
-e "FD_API_PORT=${FD_API_PORT}" \
|
||||
-e "FD_ENGINE_QUEUE_PORT=${FD_ENGINE_QUEUE_PORT}" \
|
||||
-e "FD_METRICS_PORT=${FD_METRICS_PORT}" \
|
||||
-e "FLASK_PORT=${FLASK_PORT}" \
|
||||
-e TZ="Asia/Shanghai" \
|
||||
-e "fd_wheel_url=${fd_wheel_url}" \
|
||||
-e "BASE_REF=${BASE_REF}" \
|
||||
--gpus "\"device=${DEVICES}\"" ${docker_image} /bin/bash -c '
|
||||
|
||||
git config --global --add safe.directory /workspace/FastDeploy
|
||||
cd FastDeploy
|
||||
python -m pip install paddlepaddle-gpu==3.1.1 -i https://www.paddlepaddle.org.cn/packages/stable/cu126/
|
||||
|
||||
pip config set global.extra-index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
|
||||
|
||||
python -m pip install coverage
|
||||
python -m pip install diff-cover
|
||||
python -m pip install ${fd_wheel_url}
|
||||
if [ -d "test/plugins" ]; then
|
||||
cd test/plugins
|
||||
python setup.py install
|
||||
cd ../..
|
||||
else
|
||||
echo "Warning: test/plugins directory not found, skipping setup.py install"
|
||||
fi
|
||||
export COVERAGE_FILE=/workspace/FastDeploy/coveragedata/.coverage
|
||||
export COVERAGE_RCFILE=/workspace/FastDeploy/scripts/.coveragerc
|
||||
TEST_EXIT_CODE=0
|
||||
bash scripts/coverage_run.sh || TEST_EXIT_CODE=8
|
||||
git diff origin/${BASE_REF}..HEAD --unified=0 > diff.txt
|
||||
echo "TEST_EXIT_CODE=${TEST_EXIT_CODE}" >> exit_code.env
|
||||
coverage combine coveragedata/
|
||||
coverage xml -o python_coverage_all.xml
|
||||
COVERAGE_EXIT_CODE=0
|
||||
diff-cover python_coverage_all.xml --diff-file=diff.txt --fail-under=80 --json-report diff_coverage.json || COVERAGE_EXIT_CODE=9
|
||||
echo "COVERAGE_EXIT_CODE=${COVERAGE_EXIT_CODE}" >> exit_code.env
|
||||
python scripts/generate_diff_coverage_xml.py diff.txt python_coverage_all.xml
|
||||
'
|
||||
if [ -f FastDeploy/exit_code.env ]; then
|
||||
cat FastDeploy/exit_code.env >> $GITHUB_ENV
|
||||
fi
|
||||
|
||||
python -m pip install coverage
|
||||
python -m pip install diff-cover
|
||||
python -m pip install ${fd_wheel_url}
|
||||
export COVERAGE_FILE=/workspace/FastDeploy/coveragedata/.coverage
|
||||
export COVERAGE_RCFILE=/workspace/FastDeploy/scripts/.coveragerc
|
||||
TEST_EXIT_CODE=0
|
||||
bash scripts/coverage_run.sh || TEST_EXIT_CODE=8
|
||||
git diff origin/${BASE_REF}..HEAD --unified=0 > diff.txt
|
||||
echo "TEST_EXIT_CODE=${TEST_EXIT_CODE}" >> exit_code.env
|
||||
coverage combine coveragedata/
|
||||
coverage xml -o python_coverage_all.xml
|
||||
COVERAGE_EXIT_CODE=0
|
||||
diff-cover python_coverage_all.xml --diff-file=diff.txt --fail-under=90 || COVERAGE_EXIT_CODE=9
|
||||
echo "COVERAGE_EXIT_CODE=${COVERAGE_EXIT_CODE}" >> exit_code.env
|
||||
python scripts/generate_diff_coverage_xml.py diff.txt python_coverage_all.xml
|
||||
'
|
||||
if [ -f FastDeploy/exit_code.env ]; then
|
||||
cat FastDeploy/exit_code.env >> $GITHUB_ENV
|
||||
fi
|
||||
- name: Upload unit resule and diff coverage to bos
|
||||
id: cov_upload
|
||||
shell: bash
|
||||
@@ -125,30 +183,79 @@ jobs:
|
||||
cd FastDeploy
|
||||
commit_id=${{ github.event.pull_request.head.sha }}
|
||||
pr_num=${{ github.event.pull_request.number }}
|
||||
target_path=paddle-github-action/PR/FastDeploy/${pr_num}/${commit_id}/SM${compile_arch//,/_}/CoverageData
|
||||
target_path=paddle-github-action/PR/FastDeploy/${pr_num}/${commit_id}/SM${compile_arch//,/_}
|
||||
wget -q --no-proxy --no-check-certificate https://paddle-qa.bj.bcebos.com/CodeSync/develop/PaddlePaddle/PaddleTest/tools/bos_tools.py
|
||||
push_file=$(realpath bos_tools.py)
|
||||
python -m pip install bce-python-sdk==0.9.29
|
||||
diff_cov_file="diff_coverage.xml"
|
||||
if [ -f ${diff_cov_file} ];then
|
||||
python ${push_file} ${diff_cov_file} ${target_path}
|
||||
python ${push_file} ${diff_cov_file} ${target_path}/CoverageData
|
||||
target_path_stripped="${target_path#paddle-github-action/}"
|
||||
DIFF_COV_FILE_URL=https://paddle-github-action.bj.bcebos.com/${target_path_stripped}/${diff_cov_file}
|
||||
DIFF_COV_FILE_URL=https://paddle-github-action.bj.bcebos.com/${target_path_stripped}/CoverageData/${diff_cov_file}
|
||||
echo "diff_cov_file_url=${DIFF_COV_FILE_URL}" >> $GITHUB_OUTPUT
|
||||
echo "diff_cov_file_url=${DIFF_COV_FILE_URL}" >> $GITHUB_ENV
|
||||
fi
|
||||
- name: Determine Unit Succ and whether the coverage rate reaches 90%
|
||||
diff_cov_result_json="diff_coverage.json"
|
||||
if [ -f ${diff_cov_result_json} ];then
|
||||
python ${push_file} ${diff_cov_result_json} ${target_path}/CoverageData
|
||||
target_path_stripped="${target_path#paddle-github-action/}"
|
||||
DIFF_COV_JSON_URL=https://paddle-github-action.bj.bcebos.com/${target_path_stripped}/CoverageData/${diff_cov_result_json}
|
||||
echo "diff_cov_result_json_url=${DIFF_COV_JSON_URL}" >> $GITHUB_OUTPUT
|
||||
echo "diff_cov_result_json_url=${DIFF_COV_JSON_URL}" >> $GITHUB_ENV
|
||||
fi
|
||||
unittest_result="test/failed_tests.log"
|
||||
if [ -s ${unittest_result} ];then
|
||||
python ${push_file} ${unittest_result} ${target_path}/UnitTestResult
|
||||
target_path_stripped="${target_path#paddle-github-action/}"
|
||||
UNIT_TEST_RESULT_URL=https://paddle-github-action.bj.bcebos.com/${target_path_stripped}/UnitTestResult/${unittest_result}
|
||||
echo "unittest_failed_url=${UNIT_TEST_RESULT_URL}" >> $GITHUB_OUTPUT
|
||||
echo "unittest_failed_url=${UNIT_TEST_RESULT_URL}" >> $GITHUB_ENV
|
||||
fi
|
||||
- name: Check Unit Test Success
|
||||
shell: bash
|
||||
run: |
|
||||
cd FastDeploy
|
||||
if [ "$TEST_EXIT_CODE" -eq 8 ]; then
|
||||
filename=$(basename "$unittest_failed_url")
|
||||
if [ -z "${unittest_failed_url}" ]; then
|
||||
echo "No diff unit failed file URL provided."
|
||||
else
|
||||
rm -rf "${filename}"
|
||||
wget -O ${filename} ${unittest_failed_url} || echo "Download unittest file failed, but continuing..."
|
||||
fi
|
||||
echo "Unit tests failed (exit code 8)"
|
||||
if [ -f "${filename}" ];then
|
||||
echo "Failed test cases:"
|
||||
cat "${filename}"
|
||||
fi
|
||||
exit "$TEST_EXIT_CODE"
|
||||
fi
|
||||
echo "All tests passed"
|
||||
|
||||
- name: Verify Code Coverage Threshold (80%)
|
||||
shell: bash
|
||||
run: |
|
||||
cd FastDeploy
|
||||
if [ "$COVERAGE_EXIT_CODE" -eq 9 ]; then
|
||||
echo "Coverage generation failed (exit code 9)"
|
||||
filename=$(basename "$diff_cov_result_json_url")
|
||||
if [ -z "${diff_cov_result_json_url}" ]; then
|
||||
echo "No diff cov result file URL provided."
|
||||
else
|
||||
rm -rf "${filename}"
|
||||
wget -O ${filename} ${diff_cov_result_json_url} || echo "Download cov json file failed, but continuing..."
|
||||
fi
|
||||
if [ -f "${filename}" ];then
|
||||
echo "Failed test cases:"
|
||||
if command -v jq >/dev/null 2>&1; then
|
||||
jq . "${filename}"
|
||||
else
|
||||
cat "${filename}"
|
||||
fi
|
||||
fi
|
||||
exit "$COVERAGE_EXIT_CODE"
|
||||
fi
|
||||
echo "All tests and coverage passed"
|
||||
echo "coverage passed"
|
||||
exit 0
|
||||
|
||||
diff_coverage_report:
|
||||
|
.github/workflows/approve.yml (vendored): 1 changed line
@@ -33,7 +33,6 @@ jobs:
uses: actions/setup-python@v5
with:
python-version: '3.10'
cache: 'pip'

- name: Run approval check script
run: |
.github/workflows/ci.yml (vendored): 89 changed lines
@@ -1,89 +0,0 @@
|
||||
name: CI
|
||||
|
||||
on:
|
||||
pull_request:
|
||||
branches:
|
||||
- develop
|
||||
- 'release/*'
|
||||
workflow_dispatch:
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.event.pull_request.number }}
|
||||
cancel-in-progress: true
|
||||
|
||||
jobs:
|
||||
build:
|
||||
runs-on: [self-hosted, GPU-L20-4Card]
|
||||
steps:
|
||||
- name: Print current runner name
|
||||
run: |
|
||||
echo "Current runner name: ${{ runner.name }}"
|
||||
# Because the system version is lower than 2.23, the checkout cannot be used.
|
||||
# - name: Checkout code
|
||||
# uses: actions/checkout@v4
|
||||
|
||||
- name: Code Checkout
|
||||
env:
|
||||
docker_image: ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/paddle:fastdeploy-ciuse-cuda126
|
||||
run: |
|
||||
REPO="https://github.com/${{ github.repository }}.git"
|
||||
FULL_REPO="${{ github.repository }}"
|
||||
REPO_NAME="${FULL_REPO##*/}"
|
||||
BASE_BRANCH="${{ github.base_ref }}"
|
||||
# Clean the repository directory before starting
|
||||
docker run --rm --net=host -v $(pwd):/workspace -w /workspace \
|
||||
-e "REPO_NAME=${REPO_NAME}" \
|
||||
-e "BASE_BRANCH=${BASE_BRANCH}" \
|
||||
${docker_image} /bin/bash -c '
|
||||
if [ -d ${REPO_NAME} ]; then
|
||||
echo "Directory ${REPO_NAME} exists, removing it..."
|
||||
rm -rf ${REPO_NAME}
|
||||
fi
|
||||
'
|
||||
git config --global user.name "FastDeployCI"
|
||||
git config --global user.email "fastdeploy_ci@example.com"
|
||||
git clone ${REPO} ${REPO_NAME} -b ${BASE_BRANCH}
|
||||
cd FastDeploy
|
||||
if [ "${{ github.event_name }}" = "pull_request" ]; then
|
||||
git fetch origin pull/${{ github.event.pull_request.number }}/head:pr/${{ github.event.pull_request.number }}
|
||||
git merge pr/${{ github.event.pull_request.number }}
|
||||
git log -n 3 --oneline
|
||||
else
|
||||
git checkout ${{ github.sha }}
|
||||
git log -n 3 --oneline
|
||||
fi
|
||||
|
||||
- name: Run CI unittest
|
||||
env:
|
||||
docker_image: ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/paddle:fastdeploy-ciuse-cuda126
|
||||
run: |
|
||||
runner_name="${{ runner.name }}"
|
||||
last_char="${runner_name: -1}"
|
||||
|
||||
if [ "${last_char}" = "1" ]; then
|
||||
gpu_id=2
|
||||
DEVICES="2,3"
|
||||
else
|
||||
gpu_id=0
|
||||
DEVICES="0,1"
|
||||
fi
|
||||
FD_API_PORT=$((9180 + gpu_id * 100))
|
||||
FD_ENGINE_QUEUE_PORT=$((9150 + gpu_id * 100))
|
||||
FD_METRICS_PORT=$((9170 + gpu_id * 100))
|
||||
|
||||
PARENT_DIR=$(dirname "$WORKSPACE")
|
||||
echo "PARENT_DIR:$PARENT_DIR"
|
||||
docker run --rm --net=host -v $(pwd):/workspace -w /workspace \
|
||||
-v "/ssd4/GithubActions/gitconfig:/etc/gitconfig:ro" \
|
||||
-v "/ssd4/GithubActions/ModelData:/ModelData:ro" \
|
||||
-v "/ssd4/GithubActions/CacheDir:/root/.cache" \
|
||||
-v "/ssd4/GithubActions/ConfigDir:/root/.config" \
|
||||
-e "MODEL_PATH=/ModelData" \
|
||||
-e "FD_API_PORT=${FD_API_PORT}" \
|
||||
-e "FD_ENGINE_QUEUE_PORT=${FD_ENGINE_QUEUE_PORT}" \
|
||||
-e "FD_METRICS_PORT=${FD_METRICS_PORT}" \
|
||||
--gpus '"device='"${DEVICES}"'"' ${docker_image} /bin/bash -c "
|
||||
git config --global --add safe.directory /workspace/FastDeploy
|
||||
cd FastDeploy
|
||||
bash scripts/run_ci.sh
|
||||
"
|
.github/workflows/ci_xpu.yml (vendored): 2 changed lines
@@ -13,7 +13,7 @@ concurrency:

jobs:
CI_XPU:
runs-on: [self-hosted, XPU-P800-8Card]
runs-on: [self-hosted, XPU-P800-8Card-release]
steps:
- name: Print current runner name
run: |
.github/workflows/gh-pages.yml (vendored): 2 changed lines
@@ -15,7 +15,7 @@ jobs:
- uses: actions/setup-python@v5
with:
python-version: 3.x
- run: pip install mkdocs-material mkdocs-get-deps mkdocs-material-extensions mkdocs-multilang
- run: pip install mkdocs-material mkdocs-get-deps mkdocs-material-extensions mkdocs-multilang mkdocs-static-i18n
- name: Deploy to GitHub Pages
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
.github/workflows/pr_build_and_test.yml (vendored): 19 changed lines
@@ -21,7 +21,7 @@ jobs:
with:
DOCKER_IMAGE: ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/paddleqa:cuda126-py310
FASTDEPLOY_ARCHIVE_URL: ${{ needs.clone.outputs.repo_archive_url }}
COMPILE_ARCH: "90"
COMPILE_ARCH: "89,90"
WITH_NIGHTLY_BUILD: "OFF"
FD_VERSION: "0.0.0"

@@ -39,16 +39,27 @@ jobs:
needs: [clone,build]
uses: ./.github/workflows/_unit_test_coverage.yml
with:
DOCKER_IMAGE: ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/paddleqa:cuda126-py310
DOCKER_IMAGE: ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/paddleqa:fastdeploy-ciuse-cuda126-dailyupdate
FASTDEPLOY_ARCHIVE_URL: ${{ needs.clone.outputs.repo_archive_url }}
FASTDEPLOY_WHEEL_URL: ${{ needs.build.outputs.wheel_path }}
MODEL_CACHE_DIR: "/ssd2/actions-runner/ModelData"

logprob_test:
name: Run FastDeploy LogProb Tests
needs: [build]
uses: ./.github/workflows/_logprob_test_linux.yml
with:
DOCKER_IMAGE: ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/paddleqa:cuda126-py310
DOCKER_IMAGE: ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/paddleqa:fastdeploy-ciuse-cuda126-dailyupdate
PADDLETEST_ARCHIVE_URL: "https://xly-devops.bj.bcebos.com/PaddleTest/PaddleTest.tar.gz"
FASTDEPLOY_WHEEL_URL: ${{ needs.build.outputs.wheel_path }}
MODEL_CACHE_DIR: "/ssd2/actions-runner/ModelCache"
MODEL_CACHE_DIR: "/ssd2/actions-runner/ModelData"

pre_ce_test:
name: Extracted partial CE model tasks to run in CI.
needs: [clone,build]
uses: ./.github/workflows/_pre_ce_test.yml
with:
DOCKER_IMAGE: ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/paddleqa:fastdeploy-ciuse-cuda126-dailyupdate
FASTDEPLOY_ARCHIVE_URL: ${{ needs.clone.outputs.repo_archive_url }}
FASTDEPLOY_WHEEL_URL: ${{ needs.build.outputs.wheel_path }}
MODEL_CACHE_DIR: "/ssd2/actions-runner/ModelData"
README.md: 22 changed lines
@@ -1,3 +1,4 @@
English | [简体中文](README_CN.md)
<p align="center">
<a href="https://github.com/PaddlePaddle/FastDeploy/releases"><img src="https://github.com/user-attachments/assets/42b0039f-39e3-4279-afda-6d1865dfbffb" width="500"></a>
</p>
@@ -22,11 +23,10 @@
</p>

--------------------------------------------------------------------------------
# FastDeploy 2.0: Inference and Deployment Toolkit for LLMs and VLMs based on PaddlePaddle
# FastDeploy : Inference and Deployment Toolkit for LLMs and VLMs based on PaddlePaddle

## News

**[2025-07] 《FastDeploy2.0推理部署实测》专题活动已上线!** 完成文心4.5系列开源模型的推理部署等任务,即可获得骨瓷马克杯等FastDeploy2.0官方周边及丰富奖金!🎁 欢迎大家体验反馈~ 📌[报名地址](https://www.wjx.top/vm/meSsp3L.aspx#) 📌[活动详情](https://github.com/PaddlePaddle/FastDeploy/discussions/2728)
**[2025-08] 🔥 Released FastDeploy v2.1:** A brand-new KV Cache scheduling strategy has been introduced, and expanded support for PD separation and CUDA Graph across more models. Enhanced hardware support has been added for platforms like Kunlun and Hygon, along with comprehensive optimizations to improve the performance of both the service and inference engine.

**[2025-07] The FastDeploy 2.0 Inference Deployment Challenge is now live!** Complete the inference deployment task for the ERNIE 4.5 series open-source models to win official FastDeploy 2.0 merch and generous prizes! 🎁 You're welcome to try it out and share your feedback! 📌[Sign up here](https://www.wjx.top/vm/meSsp3L.aspx#) 📌[Event details](https://github.com/PaddlePaddle/FastDeploy/discussions/2728)

@@ -50,14 +50,15 @@

## Installation

FastDeploy supports inference deployment on **NVIDIA GPUs**, **Kunlunxin XPUs**, **Iluvatar GPUs**, **Enflame GCUs**, and other hardware. For detailed installation instructions:
FastDeploy supports inference deployment on **NVIDIA GPUs**, **Kunlunxin XPUs**, **Iluvatar GPUs**, **Enflame GCUs**, **Hygon DCUs** and other hardware. For detailed installation instructions:

- [NVIDIA GPU](./docs/get_started/installation/nvidia_gpu.md)
- [Kunlunxin XPU](./docs/get_started/installation/kunlunxin_xpu.md)
- [Iluvatar GPU](./docs/get_started/installation/iluvatar_gpu.md)
- [Enflame GCU](./docs/get_started/installation/Enflame_gcu.md)
- [Hygon DCU](./docs/get_started/installation/hygon_dcu.md)

**Note:** We are actively working on expanding hardware support. Additional hardware platforms including Ascend NPU, Hygon DCU, and MetaX GPU are currently under development and testing. Stay tuned for updates!
**Note:** We are actively working on expanding hardware support. Additional hardware platforms including Ascend NPU and MetaX GPU are currently under development and testing. Stay tuned for updates!

## Get Started

@@ -68,18 +69,19 @@ Learn how to use FastDeploy through our documentation:
- [Offline Inference Development](./docs/offline_inference.md)
- [Online Service Deployment](./docs/online_serving/README.md)
- [Full Supported Models List](./docs/supported_models.md)
- [Best Practices](./docs/best_practices/README.md)

## Supported Models

| Model | Data Type | PD Disaggregation | Chunked Prefill | Prefix Caching | MTP | CUDA Graph | Maximum Context Length |
|:--- | :------- | :---------- | :-------- | :-------- | :----- | :----- | :----- |
|ERNIE-4.5-300B-A47B | BF16/WINT4/WINT8/W4A8C8/WINT2/FP8 | ✅| ✅ | ✅|✅(WINT4)| WIP |128K |
|ERNIE-4.5-300B-A47B-Base| BF16/WINT4/WINT8 | ✅| ✅ | ✅|✅(WINT4)| WIP | 128K |
|ERNIE-4.5-300B-A47B | BF16/WINT4/WINT8/W4A8C8/WINT2/FP8 | ✅| ✅ | ✅|✅| ✅ |128K |
|ERNIE-4.5-300B-A47B-Base| BF16/WINT4/WINT8 | ✅| ✅ | ✅|❌| ✅ | 128K |
|ERNIE-4.5-VL-424B-A47B | BF16/WINT4/WINT8 | WIP | ✅ | WIP | ❌ | WIP |128K |
|ERNIE-4.5-VL-28B-A3B | BF16/WINT4/WINT8 | ❌ | ✅ | WIP | ❌ | WIP |128K |
|ERNIE-4.5-21B-A3B | BF16/WINT4/WINT8/FP8 | ❌ | ✅ | ✅ | WIP | ✅|128K |
|ERNIE-4.5-21B-A3B-Base | BF16/WINT4/WINT8/FP8 | ❌ | ✅ | ✅ | WIP | ✅|128K |
|ERNIE-4.5-0.3B | BF16/WINT8/FP8 | ❌ | ✅ | ✅ | ❌ | ✅| 128K |
|ERNIE-4.5-21B-A3B | BF16/WINT4/WINT8/FP8 | ❌ | ✅ | ✅ | ✅ | ✅|128K |
|ERNIE-4.5-21B-A3B-Base | BF16/WINT4/WINT8/FP8 | ✅ | ✅ | ✅ | ❌ | ✅|128K |
|ERNIE-4.5-0.3B | BF16/WINT8/FP8 | ✅ | ✅ | ✅ | ❌ | ✅| 128K |

## Advanced Usage
README_CN.md (Normal file): 94 changed lines
@@ -0,0 +1,94 @@
|
||||
[English](README.md) | 简体中文
|
||||
<p align="center">
|
||||
<a href="https://github.com/PaddlePaddle/FastDeploy/releases"><img src="https://github.com/user-attachments/assets/42b0039f-39e3-4279-afda-6d1865dfbffb" width="500"></a>
|
||||
</p>
|
||||
<p align="center">
|
||||
<a href=""><img src="https://img.shields.io/badge/python-3.10-aff.svg"></a>
|
||||
<a href=""><img src="https://img.shields.io/badge/os-linux-pink.svg"></a>
|
||||
<a href="https://github.com/PaddlePaddle/FastDeploy/graphs/contributors"><img src="https://img.shields.io/github/contributors/PaddlePaddle/FastDeploy?color=9ea"></a>
|
||||
<a href="https://github.com/PaddlePaddle/FastDeploy/commits"><img src="https://img.shields.io/github/commit-activity/m/PaddlePaddle/FastDeploy?color=3af"></a>
|
||||
<a href="https://github.com/PaddlePaddle/FastDeploy/issues"><img src="https://img.shields.io/github/issues/PaddlePaddle/FastDeploy?color=9cc"></a>
|
||||
<a href="https://github.com/PaddlePaddle/FastDeploy/stargazers"><img src="https://img.shields.io/github/stars/PaddlePaddle/FastDeploy?color=ccf"></a>
|
||||
|
||||
</p>
|
||||
|
||||
<p align="center">
|
||||
<a href="https://trendshift.io/repositories/4046" target="_blank"><img src="https://trendshift.io/api/badge/repositories/4046" alt="PaddlePaddle%2FFastDeploy | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/></a></br>
|
||||
<a href="https://paddlepaddle.github.io/FastDeploy/zh/get_started/installation/nvidia_gpu/"><b> 安装指导 </b></a>
|
||||
|
|
||||
<a href="https://paddlepaddle.github.io/FastDeploy/zh/get_started/quick_start"><b> 快速入门 </b></a>
|
||||
|
|
||||
<a href="https://paddlepaddle.github.io/FastDeploy/zh/supported_models/"><b> 支持模型列表 </b></a>
|
||||
|
||||
</p>
|
||||
|
||||
--------------------------------------------------------------------------------
|
||||
# FastDeploy :基于飞桨的大语言模型与视觉语言模型推理部署工具包
|
||||
|
||||
## 最新活动
|
||||
**[2025-08] 🔥 FastDeploy v2.1 全新发布:** 全新的KV Cache调度策略,更多模型支持PD分离和CUDA Graph,昆仑、海光等更多硬件支持增强,全方面优化服务和推理引擎的性能。
|
||||
|
||||
**[2025-07] 《FastDeploy2.0推理部署实测》专题活动已上线!** 完成文心4.5系列开源模型的推理部署等任务,即可获得骨瓷马克杯等FastDeploy2.0官方周边及丰富奖金!🎁 欢迎大家体验反馈~ 📌[报名地址](https://www.wjx.top/vm/meSsp3L.aspx#) 📌[活动详情](https://github.com/PaddlePaddle/FastDeploy/discussions/2728)
|
||||
|
||||
## 关于
|
||||
|
||||
**FastDeploy** 是基于飞桨(PaddlePaddle)的大语言模型(LLM)与视觉语言模型(VLM)推理部署工具包,提供**开箱即用的生产级部署方案**,核心技术特性包括:
|
||||
|
||||
- 🚀 **负载均衡式PD分解**:工业级解决方案,支持上下文缓存与动态实例角色切换,在保障SLO达标和吞吐量的同时优化资源利用率
|
||||
- 🔄 **统一KV缓存传输**:轻量级高性能传输库,支持智能NVLink/RDMA选择
|
||||
- 🤝 **OpenAI API服务与vLLM兼容**:单命令部署,兼容[vLLM](https://github.com/vllm-project/vllm/)接口
|
||||
- 🧮 **全量化格式支持**:W8A16、W8A8、W4A16、W4A8、W2A16、FP8等
|
||||
- ⏩ **高级加速技术**:推测解码、多令牌预测(MTP)及分块预填充
|
||||
- 🖥️ **多硬件支持**:NVIDIA GPU、昆仑芯XPU、海光DCU、昇腾NPU、天数智芯GPU、燧原GCU、沐曦GPU等
|
||||
|
||||
## 要求
|
||||
|
||||
- 操作系统: Linux
|
||||
- Python: 3.10 ~ 3.12
|
||||
|
||||
## 安装
|
||||
|
||||
FastDeploy 支持在**英伟达(NVIDIA)GPU**、**昆仑芯(Kunlunxin)XPU**、**天数(Iluvatar)GPU**、**燧原(Enflame)GCU**、**海光(Hygon)DCU** 以及其他硬件上进行推理部署。详细安装说明如下:
|
||||
|
||||
- [英伟达 GPU](./docs/zh/get_started/installation/nvidia_gpu.md)
|
||||
- [昆仑芯 XPU](./docs/zh/get_started/installation/kunlunxin_xpu.md)
|
||||
- [天数 CoreX](./docs/zh/get_started/installation/iluvatar_gpu.md)
|
||||
- [燧原 S60](./docs/zh/get_started/installation/Enflame_gcu.md)
|
||||
- [海光 DCU](./docs/zh/get_started/installation/hygon_dcu.md)
|
||||
|
||||
**注意:** 我们正在积极拓展硬件支持范围。目前,包括昇腾(Ascend)NPU 和 沐曦(MetaX)GPU 在内的其他硬件平台正在开发测试中。敬请关注更新!
|
||||
|
||||
## 入门指南
|
||||
|
||||
通过我们的文档了解如何使用 FastDeploy:
|
||||
- [10分钟快速部署](./docs/zh/get_started/quick_start.md)
|
||||
- [ERNIE-4.5 部署](./docs/zh/get_started/ernie-4.5.md)
|
||||
- [ERNIE-4.5-VL 部署](./docs/zh/get_started/ernie-4.5-vl.md)
|
||||
- [离线推理](./docs/zh/offline_inference.md)
|
||||
- [在线服务](./docs/zh/online_serving/README.md)
|
||||
- [模型支持列表](./docs/zh/supported_models.md)
|
||||
- [最佳实践](./docs/zh/best_practices/README.md)
|
||||
|
||||
## 支持模型列表
|
||||
|
||||
| Model | Data Type | PD Disaggregation | Chunked Prefill | Prefix Caching | MTP | CUDA Graph | Maximum Context Length |
|
||||
|:--- | :------- | :---------- | :-------- | :-------- | :----- | :----- | :----- |
|
||||
|ERNIE-4.5-300B-A47B | BF16/WINT4/WINT8/W4A8C8/WINT2/FP8 | ✅| ✅ | ✅|✅| ✅ |128K |
|
||||
|ERNIE-4.5-300B-A47B-Base| BF16/WINT4/WINT8 | ✅| ✅ | ✅|❌| ✅ | 128K |
|
||||
|ERNIE-4.5-VL-424B-A47B | BF16/WINT4/WINT8 | WIP | ✅ | WIP | ❌ | WIP |128K |
|
||||
|ERNIE-4.5-VL-28B-A3B | BF16/WINT4/WINT8 | ❌ | ✅ | WIP | ❌ | WIP |128K |
|
||||
|ERNIE-4.5-21B-A3B | BF16/WINT4/WINT8/FP8 | ❌ | ✅ | ✅ | ✅ | ✅|128K |
|
||||
|ERNIE-4.5-21B-A3B-Base | BF16/WINT4/WINT8/FP8 | ✅ | ✅ | ✅ | ❌ | ✅|128K |
|
||||
|ERNIE-4.5-0.3B | BF16/WINT8/FP8 | ✅ | ✅ | ✅ | ❌ | ✅| 128K |
|
||||
|
||||
## 进阶用法
|
||||
|
||||
- [量化](./docs/zh/quantization/README.md)
|
||||
- [分离式部署](./docs/zh/features/disaggregated.md)
|
||||
- [投机解码](./docs/zh/features/speculative_decoding.md)
|
||||
- [前缀缓存](./docs/zh/features/prefix_caching.md)
|
||||
- [分块预填充](./docs/zh/features/chunked_prefill.md)
|
||||
|
||||
## 致谢
|
||||
|
||||
FastDeploy 依据 [Apache-2.0 开源许可证](./LICENSE). 进行授权。在开发过程中,我们参考并借鉴了 [vLLM](https://github.com/vllm-project/vllm) 的部分代码,以保持接口兼容性,在此表示衷心感谢。
|
@@ -1061,12 +1061,11 @@ void MultiQueryAppendAttention(
|
||||
if (!is_decoder) {
|
||||
chunk_size = static_cast<uint32_t>(encoder_max_partition_size);
|
||||
}
|
||||
const int num_chunks = div_up(max_dec_len, chunk_size);
|
||||
|
||||
const int num_chunks = div_up(max_seq_len, chunk_size);
|
||||
dim3 grids(num_blocks_x_cpu, num_chunks, kv_num_heads);
|
||||
dim3 blocks(32, num_warps);
|
||||
|
||||
if (num_chunks <= 1) {
|
||||
if (num_chunks <= 0) {
|
||||
auto nosplit_kv_kernel =
|
||||
multi_query_append_attention_warp1_4_kernel<NV_TYPE,
|
||||
false,
|
||||
@@ -1161,8 +1160,8 @@ void MultiQueryAppendAttention(
|
||||
reinterpret_cast<NV_TYPE *>(const_cast<T *>(cache_k.data<T>())),
|
||||
reinterpret_cast<NV_TYPE *>(const_cast<T *>(cache_v.data<T>())),
|
||||
shift_bias ? reinterpret_cast<NV_TYPE *>(
|
||||
const_cast<T *>(shift_bias.get().data<T>()))
|
||||
: nullptr,
|
||||
const_cast<T *>(shift_bias.get().data<T>()))
|
||||
: nullptr,
|
||||
smooth_weight ? reinterpret_cast<NV_TYPE *>(
|
||||
const_cast<T *>(smooth_weight.get().data<T>()))
|
||||
: nullptr,
|
||||
@@ -1208,8 +1207,8 @@ void MultiQueryAppendAttention(
|
||||
seq_lens_encoder.data<int>(),
|
||||
cu_seqlens_q.data<int>(),
|
||||
shift_bias ? reinterpret_cast<NV_TYPE *>(
|
||||
const_cast<T *>(shift_bias.get().data<T>()))
|
||||
: nullptr,
|
||||
const_cast<T *>(shift_bias.get().data<T>()))
|
||||
: nullptr,
|
||||
smooth_weight ? reinterpret_cast<NV_TYPE *>(const_cast<T *>(
|
||||
smooth_weight.get().data<T>()))
|
||||
: nullptr,
|
||||
@@ -1226,14 +1225,14 @@ void MultiQueryAppendAttention(
|
||||
constexpr int blockx = HEAD_DIM / vec_size;
|
||||
constexpr int blocky = (128 + blockx - 1) / blockx;
|
||||
dim3 grids_merge(min(sm_count * 4, token_num),
|
||||
num_heads);
|
||||
num_heads);
|
||||
dim3 blocks_merge(blockx, blocky);
|
||||
merge_multi_chunks_v2_kernel<NV_TYPE,
|
||||
vec_size,
|
||||
blocky,
|
||||
HEAD_DIM,
|
||||
OUT_NV_TYPE,
|
||||
ENABLE_PREFILL>
|
||||
vec_size,
|
||||
blocky,
|
||||
HEAD_DIM,
|
||||
OUT_NV_TYPE,
|
||||
ENABLE_PREFILL>
|
||||
<<<grids_merge, blocks_merge, 0, stream>>>(
|
||||
reinterpret_cast<NV_TYPE *>(tmp_workspace->ptr()),
|
||||
static_cast<float *>(tmp_m->ptr()),
|
||||
@@ -1244,8 +1243,8 @@ void MultiQueryAppendAttention(
|
||||
batch_id_per_token.data<int>(),
|
||||
cu_seqlens_q.data<int>(),
|
||||
shift_bias ? reinterpret_cast<NV_TYPE *>(
|
||||
const_cast<T *>(shift_bias.get().data<T>()))
|
||||
: nullptr,
|
||||
const_cast<T *>(shift_bias.get().data<T>()))
|
||||
: nullptr,
|
||||
smooth_weight ? reinterpret_cast<NV_TYPE *>(const_cast<T *>(
|
||||
smooth_weight.get().data<T>()))
|
||||
: nullptr,
|
||||
|
@@ -1285,10 +1285,11 @@ void MultiQueryAppendC4Attention(
|
||||
if (!is_decoder) {
|
||||
chunk_size = static_cast<uint32_t>(encoder_max_partition_size);
|
||||
}
|
||||
const int num_chunks = div_up(max_dec_len, chunk_size);
|
||||
|
||||
const int num_chunks = div_up(max_seq_len, chunk_size);
|
||||
dim3 grids(num_blocks_x_cpu, num_chunks, kv_num_heads);
|
||||
dim3 blocks(32, num_warps);
|
||||
if (num_chunks <= 1) {
|
||||
if (num_chunks <= 0) {
|
||||
auto nosplit_kv_kernel =
|
||||
multi_query_append_attention_c4_warp1_4_kernel<NV_TYPE,
|
||||
uint8_t,
|
||||
@@ -1392,15 +1393,15 @@ void MultiQueryAppendC4Attention(
|
||||
const_cast<uint8_t *>(cache_v.data<uint8_t>()),
|
||||
reinterpret_cast<NV_TYPE *>(const_cast<T *>(cache_k_scale.data<T>())),
|
||||
cache_k_zp ? reinterpret_cast<NV_TYPE *>(
|
||||
const_cast<T *>(cache_k_zp.get().data<T>()))
|
||||
: nullptr,
|
||||
const_cast<T *>(cache_k_zp.get().data<T>()))
|
||||
: nullptr,
|
||||
reinterpret_cast<NV_TYPE *>(const_cast<T *>(cache_v_scale.data<T>())),
|
||||
cache_v_zp ? reinterpret_cast<NV_TYPE *>(
|
||||
const_cast<T *>(cache_v_zp.get().data<T>()))
|
||||
: nullptr,
|
||||
const_cast<T *>(cache_v_zp.get().data<T>()))
|
||||
: nullptr,
|
||||
shift_bias ? reinterpret_cast<NV_TYPE *>(
|
||||
const_cast<T *>(shift_bias.get().data<T>()))
|
||||
: nullptr,
|
||||
const_cast<T *>(shift_bias.get().data<T>()))
|
||||
: nullptr,
|
||||
smooth_weight ? reinterpret_cast<NV_TYPE *>(
|
||||
const_cast<T *>(smooth_weight.get().data<T>()))
|
||||
: nullptr,
|
||||
@@ -1445,8 +1446,8 @@ void MultiQueryAppendC4Attention(
|
||||
seq_lens_encoder.data<int>(),
|
||||
cu_seqlens_q.data<int>(),
|
||||
shift_bias ? reinterpret_cast<NV_TYPE *>(
|
||||
const_cast<T *>(shift_bias.get().data<T>()))
|
||||
: nullptr,
|
||||
const_cast<T *>(shift_bias.get().data<T>()))
|
||||
: nullptr,
|
||||
smooth_weight ? reinterpret_cast<NV_TYPE *>(const_cast<T *>(
|
||||
smooth_weight.get().data<T>()))
|
||||
: nullptr,
|
||||
@@ -1463,14 +1464,14 @@ void MultiQueryAppendC4Attention(
|
||||
constexpr int blockx = HEAD_DIM / vec_size;
|
||||
constexpr int blocky = (128 + blockx - 1) / blockx;
|
||||
dim3 grids_merge(min(sm_count * 4, token_num),
|
||||
num_heads);
|
||||
num_heads);
|
||||
dim3 blocks_merge(blockx, blocky);
|
||||
merge_multi_chunks_v2_kernel<NV_TYPE,
|
||||
vec_size,
|
||||
blocky,
|
||||
HEAD_DIM,
|
||||
OUT_NV_TYPE,
|
||||
ENABLE_PREFILL>
|
||||
vec_size,
|
||||
blocky,
|
||||
HEAD_DIM,
|
||||
OUT_NV_TYPE,
|
||||
ENABLE_PREFILL>
|
||||
<<<grids_merge, blocks_merge, 0, stream>>>(
|
||||
reinterpret_cast<NV_TYPE *>(tmp_workspace->ptr()),
|
||||
static_cast<float *>(tmp_m->ptr()),
|
||||
@@ -1481,8 +1482,8 @@ void MultiQueryAppendC4Attention(
|
||||
batch_id_per_token.data<int>(),
|
||||
cu_seqlens_q.data<int>(),
|
||||
shift_bias ? reinterpret_cast<NV_TYPE *>(
|
||||
const_cast<T *>(shift_bias.get().data<T>()))
|
||||
: nullptr,
|
||||
const_cast<T *>(shift_bias.get().data<T>()))
|
||||
: nullptr,
|
||||
smooth_weight ? reinterpret_cast<NV_TYPE *>(const_cast<T *>(
|
||||
smooth_weight.get().data<T>()))
|
||||
: nullptr,
|
||||
|
@@ -1254,10 +1254,10 @@ void MultiQueryAppendC8Attention(
|
||||
chunk_size = static_cast<uint32_t>(encoder_max_partition_size);
|
||||
}
|
||||
|
||||
const int num_chunks = div_up(max_dec_len, chunk_size);
|
||||
const int num_chunks = div_up(max_seq_len, chunk_size);
|
||||
dim3 grids(num_blocks_x_cpu, num_chunks, kv_num_heads);
|
||||
dim3 blocks(32, num_warps);
|
||||
if (num_chunks <= 1) {
|
||||
if (num_chunks <= 0) {
|
||||
auto nosplit_kv_kernel =
|
||||
multi_query_append_attention_c8_warp1_4_kernel<NV_TYPE,
|
||||
uint8_t,
|
||||
@@ -1377,8 +1377,8 @@ void MultiQueryAppendC8Attention(
|
||||
reinterpret_cast<NV_TYPE *>(const_cast<T *>(cache_k_scale.data<T>())),
|
||||
reinterpret_cast<NV_TYPE *>(const_cast<T *>(cache_v_scale.data<T>())),
|
||||
shift_bias ? reinterpret_cast<NV_TYPE *>(
|
||||
const_cast<T *>(shift_bias.get().data<T>()))
|
||||
: nullptr,
|
||||
const_cast<T *>(shift_bias.get().data<T>()))
|
||||
: nullptr,
|
||||
smooth_weight ? reinterpret_cast<NV_TYPE *>(
|
||||
const_cast<T *>(smooth_weight.get().data<T>()))
|
||||
: nullptr,
|
||||
@@ -1418,8 +1418,8 @@ void MultiQueryAppendC8Attention(
|
||||
seq_lens_encoder.data<int>(),
|
||||
cu_seqlens_q.data<int>(),
|
||||
shift_bias ? reinterpret_cast<NV_TYPE *>(
|
||||
const_cast<T *>(shift_bias.get().data<T>()))
|
||||
: nullptr,
|
||||
const_cast<T *>(shift_bias.get().data<T>()))
|
||||
: nullptr,
|
||||
smooth_weight ? reinterpret_cast<NV_TYPE *>(const_cast<T *>(
|
||||
smooth_weight.get().data<T>()))
|
||||
: nullptr,
|
||||
@@ -1436,14 +1436,14 @@ void MultiQueryAppendC8Attention(
|
||||
constexpr int blockx = HEAD_DIM / vec_size;
|
||||
constexpr int blocky = (128 + blockx - 1) / blockx;
|
||||
dim3 grids_merge(min(sm_count * 4, token_num),
|
||||
num_heads);
|
||||
num_heads);
|
||||
dim3 blocks_merge(blockx, blocky);
|
||||
merge_multi_chunks_v2_kernel<NV_TYPE,
|
||||
vec_size,
|
||||
blocky,
|
||||
HEAD_DIM,
|
||||
OUT_NV_TYPE,
|
||||
ENABLE_PREFILL>
|
||||
vec_size,
|
||||
blocky,
|
||||
HEAD_DIM,
|
||||
OUT_NV_TYPE,
|
||||
ENABLE_PREFILL>
|
||||
<<<grids_merge, blocks_merge, 0, stream>>>(
|
||||
reinterpret_cast<NV_TYPE *>(tmp_workspace->ptr()),
|
||||
static_cast<float *>(tmp_m->ptr()),
|
||||
@@ -1454,8 +1454,8 @@ void MultiQueryAppendC8Attention(
|
||||
batch_id_per_token.data<int>(),
|
||||
cu_seqlens_q.data<int>(),
|
||||
shift_bias ? reinterpret_cast<NV_TYPE *>(
|
||||
const_cast<T *>(shift_bias.get().data<T>()))
|
||||
: nullptr,
|
||||
const_cast<T *>(shift_bias.get().data<T>()))
|
||||
: nullptr,
|
||||
smooth_weight ? reinterpret_cast<NV_TYPE *>(const_cast<T *>(
|
||||
smooth_weight.get().data<T>()))
|
||||
: nullptr,
|
||||
|
@@ -1,6 +1,6 @@
FROM ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/fastdeploy-cuda-12.6:2.0.0
ARG PADDLE_VERSION=3.1.0
ARG FD_VERSION=2.0.0
FROM ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/fastdeploy-cuda-12.6:2.1.0
ARG PADDLE_VERSION=3.1.1
ARG FD_VERSION=2.1.0

ENV DEBIAN_FRONTEND=noninteractive

@@ -16,11 +16,17 @@ RUN apt-get update && apt-get install -y libibverbs-dev librdmacm-dev cmake pybi

# uninstall existing package
RUN python -m pip uninstall paddlepaddle-gpu paddlepaddle-xpu -y
# install paddlepaddle
# install paddlepaddle-xpu
RUN python -m pip install --no-cache-dir --progress-bar off paddlepaddle-xpu==${PADDLE_VERSION} -i https://www.paddlepaddle.org.cn/packages/stable/xpu-p800/

RUN python -m pip install --no-cache-dir fastdeploy-xpu==${FD_VERSION} -i https://www.paddlepaddle.org.cn/packages/stable/fastdeploy-xpu-p800/ --extra-index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple

RUN mkdir -p /workspace/deps && cd /workspace/deps && \
wget https://klx-sdk-release-public.su.bcebos.com/xre/kl3-release/5.0.21.21/xre-Linux-x86_64-5.0.21.21.tar.gz && \
tar -zxf xre-Linux-x86_64-5.0.21.21.tar.gz && mv xre-Linux-x86_64-5.0.21.21 xre

ENV PATH=/workspace/deps/xre/bin:$PATH

ENV http_proxy=""
ENV https_proxy=""
ENV no_proxy=""
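The ARG defaults above can be overridden at build time; a hedged example, where the Dockerfile path and image tag are assumptions (the actual path is not shown in this diff):

```bash
# Hypothetical build invocation for the XPU image above; path and tag are assumptions.
docker build \
  --build-arg PADDLE_VERSION=3.1.1 \
  --build-arg FD_VERSION=2.1.0 \
  -t fastdeploy-xpu:2.1.0 \
  -f dockerfiles/Dockerfile.xpu .
```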
@@ -2,7 +2,8 @@
## Environmental Preparation
### 1.1 Hardware requirements
The minimum number of GPUs required to deploy `ERNIE-4.5-0.3B` on the following hardware for each quantization is as follows:
| | WINT8 | WINT4 | FP8 |

| | WINT8 | WINT4 | FP8 |
|-----|-----|-----|-----|
|H800 80GB| 1 | 1 | 1 |
|A800 80GB| 1 | 1 | / |
@@ -24,12 +25,12 @@ The minimum number of GPUs required to deploy `ERNIE-4.5-0.3B` on the following
### 2.1 Basic: Launching the Service
Start the service by following command:
```bash
export ENABLE_V1_KVCACHE_SCHEDULER=1
python -m fastdeploy.entrypoints.openai.api_server \
--model baidu/ERNIE-4.5-0.3B-Paddle \
--tensor-parallel-size 1 \
--quantization wint4 \
--max-model-len 32768 \
--kv-cache-ratio 0.75 \
--max-num-seqs 128
```
- `--quantization`: indicates the quantization strategy used by the model. Different quantization strategies will result in different performance and accuracy of the model. It could be one of `wint8` / `wint4` / `block_wise_fp8`(Hopper is needed).
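Switching strategies only requires changing that flag; a sketch of the same launch with `wint8`, with every other value taken from the command above (`block_wise_fp8` would additionally require a Hopper GPU):

```bash
# Same service as above, with the weight-only INT8 strategy instead of wint4.
export ENABLE_V1_KVCACHE_SCHEDULER=1
python -m fastdeploy.entrypoints.openai.api_server \
    --model baidu/ERNIE-4.5-0.3B-Paddle \
    --tensor-parallel-size 1 \
    --quantization wint8 \
    --max-model-len 32768 \
    --kv-cache-ratio 0.75 \
    --max-num-seqs 128
```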
@@ -75,9 +76,9 @@ Add the following lines to the startup parameters
--use-cudagraph
```
Notes:
1. Usually, no additional parameters need to be set, but CUDAGraph will generate some additional memory overhead, which may need to be adjusted in some scenarios with limited memory. For detailed parameter adjustments, please refer to [GraphOptimizationBackend](../parameters.md) for related configuration parameter descriptions
2. When CUDAGraph is enabled, only single-card inference is supported, that is, `--tensor-parallel-size 1`
3. When CUDAGraph is enabled, it is not supported to enable `Chunked Prefill` and `Prefix Caching` at the same time
1. Usually, no additional parameters need to be set, but CUDAGraph will generate some additional memory overhead, which may need to be adjusted in some scenarios with limited memory. For detailed parameter adjustments, please refer to [GraphOptimizationBackend](../features/graph_optimization.md) for related configuration parameter descriptions
2. When CUDAGraph is enabled, if running with multi-GPUs TP>1, `--enable-custom-all-reduce` must be specified at the same time.
3. When CUDAGraph is enabled, the scenario of `max-model-len > 32768` is not currently supported.

#### 2.2.6 Rejection Sampling
**Idea:**
@@ -2,7 +2,8 @@
## Environmental Preparation
### 1.1 Hardware requirements
The minimum number of GPUs required to deploy `ERNIE-4.5-21B-A3B` on the following hardware for each quantization is as follows:
| | WINT8 | WINT4 | FP8 |

| | WINT8 | WINT4 | FP8 |
|-----|-----|-----|-----|
|H800 80GB| 1 | 1 | 1 |
|A800 80GB| 1 | 1 | / |
@@ -24,12 +25,12 @@ The minimum number of GPUs required to deploy `ERNIE-4.5-21B-A3B` on the followi
### 2.1 Basic: Launching the Service
Start the service by following command:
```bash
export ENABLE_V1_KVCACHE_SCHEDULER=1
python -m fastdeploy.entrypoints.openai.api_server \
--model baidu/ERNIE-4.5-21B-A3B-Paddle \
--tensor-parallel-size 1 \
--quantization wint4 \
--max-model-len 32768 \
--kv-cache-ratio 0.75 \
--max-num-seqs 128
```
- `--quantization`: indicates the quantization strategy used by the model. Different quantization strategies will result in different performance and accuracy of the model. It could be one of `wint8` / `wint4` / `block_wise_fp8`(Hopper is needed).
@@ -85,9 +86,9 @@ Add the following lines to the startup parameters
--use-cudagraph
```
Notes:
1. Usually, no additional parameters need to be set, but CUDAGraph will generate some additional memory overhead, which may need to be adjusted in some scenarios with limited memory. For detailed parameter adjustments, please refer to [GraphOptimizationBackend](../parameters.md) for related configuration parameter descriptions
2. When CUDAGraph is enabled, only single-card inference is supported, that is, `--tensor-parallel-size 1`
3. When CUDAGraph is enabled, it is not supported to enable `Chunked Prefill` and `Prefix Caching` at the same time
1. Usually, no additional parameters need to be set, but CUDAGraph will generate some additional memory overhead, which may need to be adjusted in some scenarios with limited memory. For detailed parameter adjustments, please refer to [GraphOptimizationBackend](../features/graph_optimization.md) for related configuration parameter descriptions
2. When CUDAGraph is enabled, if running with multi-GPUs TP>1, `--enable-custom-all-reduce` must be specified at the same time.
3. When CUDAGraph is enabled, the scenario of `max-model-len > 32768` is not currently supported.
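Putting notes 2 and 3 together, a hedged sketch of a multi-GPU launch with CUDA Graph enabled (the tensor-parallel size of 2 is an assumption; the remaining values follow the command in 2.1):

```bash
# Multi-GPU (TP=2, assumed) launch with CUDA Graph; custom all-reduce is required when TP>1.
export ENABLE_V1_KVCACHE_SCHEDULER=1
python -m fastdeploy.entrypoints.openai.api_server \
    --model baidu/ERNIE-4.5-21B-A3B-Paddle \
    --tensor-parallel-size 2 \
    --quantization wint4 \
    --max-model-len 32768 \
    --max-num-seqs 128 \
    --use-cudagraph \
    --enable-custom-all-reduce
```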
#### 2.2.6 Rejection Sampling
**Idea:**
@@ -2,7 +2,8 @@
## Environmental Preparation
### 1.1 Hardware requirements
The minimum number of GPUs required to deploy `ERNIE-4.5-300B-A47B` on the following hardware for each quantization is as follows:
| | WINT8 | WINT4 | FP8 | WINT2 | W4A8 |

| | WINT8 | WINT4 | FP8 | WINT2 | W4A8 |
|-----|-----|-----|-----|-----|-----|
|H800 80GB| 8 | 4 | 8 | 2 | 4 |
|A800 80GB| 8 | 4 | / | 2 | 4 |
@@ -21,12 +22,12 @@ The minimum number of GPUs required to deploy `ERNIE-4.5-300B-A47B` on the follo
### 2.1 Basic: Launching the Service
Start the service by following command:
```bash
export ENABLE_V1_KVCACHE_SCHEDULER=1
python -m fastdeploy.entrypoints.openai.api_server \
--model baidu/ERNIE-4.5-300B-A47B-Paddle \
--tensor-parallel-size 8 \
--quantization wint4 \
--max-model-len 32768 \
--kv-cache-ratio 0.75 \
--max-num-seqs 128
```
- `--quantization`: indicates the quantization strategy used by the model. Different quantization strategies will result in different performance and accuracy of the model. It could be one of `wint8` / `wint4` / `block_wise_fp8`(Hopper is needed).
@@ -123,5 +124,20 @@ python -m fastdeploy.entrypoints.openai.api_server \
--splitwise-role "decode"
```

#### 2.2.8 CUDAGraph
**Idea:**
CUDAGraph is a GPU computing acceleration technology provided by NVIDIA. It achieves efficient execution and optimization of GPU tasks by capturing CUDA operation sequences into a graph structure. The core idea of CUDAGraph is to encapsulate a series of GPU computing and memory operations into a re-executable graph, thereby reducing CPU-GPU communication overhead, reducing kernel startup latency, and improving overall computing performance.

**How to enable:**
Add the following lines to the startup parameters
```
--use-cudagraph
--enable-custom-all-reduce
```
Notes:
1. Usually, no additional parameters need to be set, but CUDAGraph will generate some additional memory overhead, which may need to be adjusted in some scenarios with limited memory. For detailed parameter adjustments, please refer to [GraphOptimizationBackend](../features/graph_optimization.md) for related configuration parameter descriptions
2. When CUDAGraph is enabled, if running with multi-GPUs TP>1, `--enable-custom-all-reduce` must be specified at the same time.
3. When CUDAGraph is enabled, the scenario of `max-model-len > 32768` is not currently supported.

## FAQ
If you encounter any problems during use, you can refer to [FAQ](./FAQ.md).
134
docs/best_practices/ERNIE-4.5-VL-28B-A3B-Paddle.md
Normal file
@@ -0,0 +1,134 @@
|
||||
|
||||
# ERNIE-4.5-VL-28B-A3B-Paddle
|
||||
|
||||
## 1. Environment Preparation
|
||||
### 1.1 Support Status
|
||||
|
||||
The minimum number of cards required for deployment on the following hardware is as follows:
|
||||
|
||||
| Device [GPU Mem] | WINT4 | WINT8 | BFLOAT16 |
|
||||
|:----------:|:----------:|:------:| :------:|
|
||||
| A30 [24G] | 2 | 2 | 4 |
|
||||
| L20 [48G] | 1 | 1 | 2 |
|
||||
| H20 [144G] | 1 | 1 | 1 |
|
||||
| A100 [80G] | 1 | 1 | 1 |
|
||||
| H800 [80G] | 1 | 1 | 1 |
|
||||
|
||||
### 1.2 Install Fastdeploy
|
||||
|
||||
Installation process reference documentation [FastDeploy GPU Install](../get_started/installation/nvidia_gpu.md)
|
||||
|
||||
> ⚠️ Precautions:
|
||||
> - FastDeploy only supports models in Paddle format – please make sure to download a model whose name ends with the `-Paddle` suffix.
|
||||
> - The model name will trigger an automatic download. If the model has already been downloaded, you can directly use the absolute path to the model's download location.
|
||||
|
||||
## 2. How to Use
|
||||
### 2.1 Basic: Launching the Service
|
||||
**Example 1:** Deploying a 32K Context Service on a Single RTX 4090 GPU
|
||||
```shell
|
||||
export ENABLE_V1_KVCACHE_SCHEDULER=1
|
||||
python -m fastdeploy.entrypoints.openai.api_server \
|
||||
--model baidu/ERNIE-4.5-VL-28B-A3B-Paddle \
|
||||
--port 8180 \
|
||||
--metrics-port 8181 \
|
||||
--engine-worker-queue-port 8182 \
|
||||
--tensor-parallel-size 1 \
|
||||
--max-model-len 32768 \
|
||||
--max-num-seqs 256 \
|
||||
--limit-mm-per-prompt '{"image": 100, "video": 100}' \
|
||||
--reasoning-parser ernie-45-vl \
|
||||
--gpu-memory-utilization 0.9 \
|
||||
--enable-chunked-prefill \
|
||||
--max-num-batched-tokens 384 \
|
||||
--quantization wint4 \
|
||||
--enable-mm
|
||||
```
|
||||
**Example 2:** Deploying a 128K Context Service on Dual H800 GPUs
|
||||
```shell
|
||||
export ENABLE_V1_KVCACHE_SCHEDULER=1
|
||||
python -m fastdeploy.entrypoints.openai.api_server \
|
||||
--model baidu/ERNIE-4.5-VL-28B-A3B-Paddle \
|
||||
--port 8180 \
|
||||
--metrics-port 8181 \
|
||||
--engine-worker-queue-port 8182 \
|
||||
--tensor-parallel-size 2 \
|
||||
--max-model-len 131072 \
|
||||
--max-num-seqs 256 \
|
||||
--limit-mm-per-prompt '{"image": 100, "video": 100}' \
|
||||
--reasoning-parser ernie-45-vl \
|
||||
--gpu-memory-utilization 0.9 \
|
||||
--enable-chunked-prefill \
|
||||
--max-num-batched-tokens 384 \
|
||||
--quantization wint4 \
|
||||
--enable-mm
|
||||
```
|
||||
|
||||
> ⚠️ For versions 2.1 and above, the new scheduler needs to be enabled via an environment variable `ENABLE_V1_KVCACHE_SCHEDULER=1`. Otherwise, some requests may be truncated before reaching the maximum length or return empty results.
|
||||
|
||||
The example above is a set of configurations that runs stably while delivering relatively good performance. If you have further requirements for precision or performance, please continue reading below.
|
||||
### 2.2 Advanced: How to Achieve Better Performance
|
||||
|
||||
#### 2.2.1 Evaluating Application Scenarios and Setting Parameters Correctly
|
||||
> **Context Length**
|
||||
- **Parameters:** `--max-model-len`
|
||||
- **Description:** Controls the maximum context length that the model can process.
|
||||
- **Recommendation:** Longer context lengths may reduce throughput. Adjust based on actual needs, with a maximum supported context length of **128k** (131,072).
|
||||
|
||||
⚠️ Note: Longer context lengths will significantly increase GPU memory requirements. Ensure your hardware resources are sufficient before setting a longer context.
|
||||
> **Maximum sequence count**
|
||||
- **Parameters:** `--max-num-seqs`
|
||||
- **Description:** Controls the maximum number of sequences the service can handle, supporting a range of 1 to 256.
|
||||
- **Recommendation:** If you are unsure of the average number of sequences per request in your actual application scenario, we recommend setting it to **256**. If the average number of sequences per request in your application is significantly fewer than 256, we suggest setting it to a slightly higher value than the average to further reduce GPU memory usage and optimize service performance.
|
||||
|
||||
> **Multi-image and multi-video input**
|
||||
- **Parameters**: `--limit-mm-per-prompt`
- **Description**: Our model supports multi-image and multi-video input in a single prompt. Use this parameter to limit the number of images/videos per request, ensuring efficient resource utilization.
- **Recommendation**: We recommend setting the number of images and videos in a single prompt to **100 each** to balance performance and memory usage; an example request is shown below.
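As an illustration, the request below sends two images in one prompt through the OpenAI-compatible endpoint. This is a minimal sketch that assumes the 32K service from Example 1 is listening on port 8180 and that the image URLs are reachable from the server; adjust the base URL, model name and content to your deployment.

```python
from openai import OpenAI

client = OpenAI(base_url="http://127.0.0.1:8180/v1", api_key="null")  # api_key is not checked locally

response = client.chat.completions.create(
    model="baidu/ERNIE-4.5-VL-28B-A3B-Paddle",
    messages=[{
        "role": "user",
        "content": [
            {"type": "image_url", "image_url": {"url": "https://example.com/image_1.jpg"}},
            {"type": "image_url", "image_url": {"url": "https://example.com/image_2.jpg"}},
            {"type": "text", "text": "Compare the two images."},
        ],
    }],
)
print(response.choices[0].message.content)
```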
|
||||
|
||||
> **Available GPU memory ratio during initialization**
|
||||
- **Parameters:** `--gpu-memory-utilization`
|
||||
- **Description:** Controls the available GPU memory for FastDeploy service initialization. The default value is 0.9, meaning 10% of the memory is reserved for backup.
|
||||
- **Recommendation:** It is recommended to use the default value of 0.9. If an "out of memory" error occurs during stress testing, you may attempt to reduce this value.
|
||||
|
||||
#### 2.2.2 Chunked Prefill
|
||||
- **Parameters:** `--enable-chunked-prefill`
|
||||
- **Description:** Enabling `chunked prefill` can **reduce peak GPU memory usage** and **improve service throughput**.
|
||||
- **Other relevant configurations**:
|
||||
|
||||
`--max-num-batched-tokens`:Limit the maximum number of tokens per chunk, with a recommended setting of 384.
|
||||
|
||||
#### 2.2.3 **Quantization precision**
|
||||
- **Parameters:** `--quantization`
|
||||
|
||||
- **Supported precision types:**
|
||||
- WINT4 (Suitable for most users)
|
||||
- WINT8
|
||||
- BFLOAT16 (When the `--quantization` parameter is not set, BFLOAT16 is used by default.)
|
||||
|
||||
- **Recommendation:**
|
||||
- Unless you have extremely stringent precision requirements, we strongly recommend using WINT4 quantization. This will significantly reduce memory consumption and increase throughput.
|
||||
- If slightly higher precision is required, you may try WINT8.
|
||||
- Only consider using BFLOAT16 if your application scenario demands extreme precision, as it requires significantly more GPU memory.
|
||||
|
||||
#### 2.2.4 **Adjustable environment variables**
|
||||
> **Rejection sampling:**`FD_SAMPLING_CLASS=rejection`
|
||||
- **Description:** Rejection sampling involves generating samples from a proposal distribution that is easy to sample from, thereby avoiding explicit sorting and achieving an effect of improving sampling speed, which can enhance inference performance.
|
||||
- **Recommendation:** This is a relatively aggressive optimization strategy that affects the results, and we are still conducting comprehensive validation of its impact. If you have high performance requirements and can accept potential compromises in results, you may consider enabling this strategy. A way to enable it for offline inference is sketched below.
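A minimal sketch for offline use (the environment variable must be set before the engine is created; model name and arguments here are only illustrative):

```python
import os

# Must be set before the LLM engine starts; for the API server, export it in the shell instead.
os.environ["FD_SAMPLING_CLASS"] = "rejection"

from fastdeploy.entrypoints.llm import LLM

llm = LLM(model="baidu/ERNIE-4.5-VL-28B-A3B-Paddle", tensor_parallel_size=1, max_model_len=32768)
```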
|
||||
|
||||
> **Attention Hyperparameter:**`FLAGS_max_partition_size=1024`
|
||||
- **Description:** The hyperparameters for the Append Attention (default) backend have been tested on commonly used datasets, and our results show that setting it to 1024 can significantly improve decoding speed, especially in long-text scenarios.
|
||||
- **Recommendation:** In the future, it will be modified to an automatic adjustment mechanism. If you have high performance requirements, you may consider enabling it.
|
||||
|
||||
## 3. FAQ
|
||||
**Note:** Deploying multimodal services requires adding the `--enable-mm` parameter to the startup configuration.
|
||||
|
||||
### 3.1 Out of Memory
|
||||
If the service prompts "Out of Memory" during startup, please try the following solutions:
|
||||
1. Ensure no other processes are occupying GPU memory (a quick check is sketched after this list);
|
||||
2. Use WINT4/WINT8 quantization and enable chunked prefill;
|
||||
3. Reduce context length and maximum sequence count as needed;
|
||||
4. Increase the number of GPU cards for deployment (e.g., 2 or 4 cards) by modifying the parameter `--tensor-parallel-size 2` or `--tensor-parallel-size 4`.
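For the first check above, one quick way to list the processes currently holding GPU memory is the small helper below (it assumes `nvidia-smi` is available on the PATH):

```python
import subprocess

result = subprocess.run(
    ["nvidia-smi", "--query-compute-apps=pid,process_name,used_memory", "--format=csv"],
    capture_output=True, text=True, check=True,
)
print(result.stdout)  # one line per process currently holding GPU memory
```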
|
||||
|
||||
If the service starts normally but later reports insufficient memory, try:
|
||||
1. Adjust the initial GPU memory utilization ratio by modifying `--gpu-memory-utilization`;
|
||||
2. Increase the number of deployment cards (parameter adjustment as above).
|
110
docs/best_practices/ERNIE-4.5-VL-424B-A47B-Paddle.md
Normal file
@@ -0,0 +1,110 @@
|
||||
|
||||
# ERNIE-4.5-VL-424B-A47B-Paddle
|
||||
|
||||
## 1. Environment Preparation
|
||||
### 1.1 Support Status
|
||||
The minimum number of cards required for deployment on the following hardware is as follows:
|
||||
|
||||
| Device [GPU Mem] | WINT4 | WINT8 | BFLOAT16 |
|
||||
|:----------:|:----------:|:------:| :------:|
|
||||
| H20 [144G] | 8 | 8 | 8 |
|
||||
| A100 [80G] | 8 | 8 | - |
|
||||
| H800 [80G] | 8 | 8 | - |
|
||||
|
||||
### 1.2 Install Fastdeploy
|
||||
|
||||
Installation process reference documentation [FastDeploy GPU Install](../get_started/installation/nvidia_gpu.md)
|
||||
|
||||
> ⚠️ Precautions:
|
||||
> - FastDeploy only supports models in Paddle format – please make sure to download a model whose name ends with the `-Paddle` suffix.
|
||||
> - The model name will trigger an automatic download. If the model has already been downloaded, you can directly use the absolute path to the model's download location.
|
||||
|
||||
## 2. How to Use
|
||||
### 2.1 Basic: Launching the Service
|
||||
**Example 1:** Deploying a 128K context service on 8x H800 GPUs.
|
||||
```shell
|
||||
export ENABLE_V1_KVCACHE_SCHEDULER=1
|
||||
python -m fastdeploy.entrypoints.openai.api_server \
|
||||
--model baidu/ERNIE-4.5-VL-424B-A47B-Paddle \
|
||||
--port 8180 \
|
||||
--metrics-port 8181 \
|
||||
--engine-worker-queue-port 8182 \
|
||||
--tensor-parallel-size 8 \
|
||||
--max-model-len 131072 \
|
||||
--max-num-seqs 16 \
|
||||
--limit-mm-per-prompt '{"image": 100, "video": 100}' \
|
||||
--reasoning-parser ernie-45-vl \
|
||||
--gpu-memory-utilization 0.8 \
|
||||
--enable-chunked-prefill \
|
||||
--max-num-batched-tokens 384 \
|
||||
--quantization wint4 \
|
||||
--enable-mm
|
||||
```
|
||||
|
||||
> ⚠️ For versions 2.1 and above, the new scheduler needs to be enabled via an environment variable `ENABLE_V1_KVCACHE_SCHEDULER=1`. Otherwise, some requests may be truncated before reaching the maximum length or return empty results.
|
||||
|
||||
The example above is a set of configurations that runs stably while delivering relatively good performance. If you have further requirements for precision or performance, please continue reading below.
|
||||
### 2.2 Advanced: How to Achieve Better Performance
|
||||
|
||||
#### 2.2.1 Evaluating Application Scenarios and Setting Parameters Correctly
|
||||
> **Context Length**
|
||||
- **Parameters:** `--max-model-len`
|
||||
- **Description:** Controls the maximum context length that the model can process.
|
||||
- **Recommendation:** Longer context lengths may reduce throughput. Adjust based on actual needs, with a maximum supported context length of **128k** (131,072).
|
||||
|
||||
⚠️ Note: Longer context lengths will significantly increase GPU memory requirements. Ensure your hardware resources are sufficient before setting a longer context.
|
||||
> **Maximum sequence count**
|
||||
- **Parameters:** `--max-num-seqs`
|
||||
- **Description:** Controls the maximum number of sequences the service can handle, supporting a range of 1 to 256.
|
||||
- **Recommendation:** If you are unsure of the average number of sequences per request in your actual application scenario, we recommend setting it to **256**. If the average number of sequences per request in your application is significantly fewer than 256, we suggest setting it to a slightly higher value than the average to further reduce GPU memory usage and optimize service performance.
|
||||
|
||||
> **Multi-image and multi-video input**
|
||||
- **Parameters**:`--limit-mm-per-prompt`
|
||||
- **Description**:Our model supports multi-image and multi-video input in a single prompt. Please use this **Parameters** setting to limit the number of images/videos per request, ensuring efficient resource utilization.
|
||||
- **Recommendation**:We recommend setting the number of images and videos in a single prompt to **100 each** to balance performance and memory usage.
|
||||
|
||||
> **Available GPU memory ratio during initialization**
|
||||
- **Parameters:** `--gpu-memory-utilization`
|
||||
- **Description:** Controls the available GPU memory for FastDeploy service initialization. The default value is 0.9, meaning 10% of the memory is reserved for backup.
|
||||
- **Recommendation:** It is recommended to use the default value of 0.9. If an "out of memory" error occurs during stress testing, you may attempt to reduce this value.
|
||||
|
||||
#### 2.2.2 Chunked Prefill
|
||||
- **Parameters:** `--enable-chunked-prefill`
|
||||
- **Description:** Enabling `chunked prefill` can **reduce peak GPU memory usage** and **improve service throughput**.
|
||||
- **Other relevant configurations**:
|
||||
|
||||
`--max-num-batched-tokens`:Limit the maximum number of tokens per chunk, with a recommended setting of 384.
|
||||
|
||||
#### 2.2.3 **Quantization precision**
|
||||
- **Parameters:** `--quantization`
|
||||
|
||||
- **Supported precision types:**
|
||||
- wint4 (Suitable for most users)
|
||||
- wint8
|
||||
- bfloat16 (When the `--quantization` parameter is not set, bfloat16 is used by default.)
|
||||
|
||||
- **Recommendation:**
|
||||
- Unless you have extremely stringent precision requirements, we strongly recommend using wint4 quantization. This will significantly reduce memory consumption and increase throughput.
|
||||
- If slightly higher precision is required, you may try wint8.
|
||||
- Only consider using bfloat16 if your application scenario demands extreme precision, as it requires significantly more GPU memory.
|
||||
|
||||
#### 2.2.4 **Adjustable environment variables**
|
||||
> **Rejection sampling:**`FD_SAMPLING_CLASS=rejection`
|
||||
- **Description:** Rejection sampling involves generating samples from a proposal distribution that is easy to sample from, thereby avoiding explicit sorting and achieving an effect of improving sampling speed, which can enhance inference performance.
|
||||
- **Recommendation:** This is a relatively aggressive optimization strategy that affects the results, and we are still conducting comprehensive validation of its impact. If you have high performance requirements and can accept potential compromises in results, you may consider enabling this strategy.
|
||||
|
||||
> **Attention Hyperparameter:**`FLAGS_max_partition_size=1024`
|
||||
- **Description:** The hyperparameters for the Append Attention (default) backend have been tested on commonly used datasets, and our results show that setting it to 1024 can significantly improve decoding speed, especially in long-text scenarios.
|
||||
- **Recommendation:** In the future, it will be modified to an automatic adjustment mechanism. If you have high performance requirements, you may consider enabling it.
|
||||
|
||||
## 3. FAQ
|
||||
**Note:** Deploying multimodal services requires adding the `--enable-mm` parameter to the startup configuration.
|
||||
|
||||
### 3.1 Out of Memory
|
||||
If the service prompts "Out of Memory" during startup, please try the following solutions:
|
||||
1. Ensure no other processes are occupying GPU memory;
|
||||
2. Use wint4/wint8 quantization and enable chunked prefill;
|
||||
3. Reduce context length and maximum sequence count as needed.
|
||||
|
||||
If the service starts normally but later reports insufficient memory, try:
|
||||
1. Adjust the initial GPU memory utilization ratio by modifying `--gpu-memory-utilization`.
|
7
docs/best_practices/README.md
Normal file
@@ -0,0 +1,7 @@
# Optimal Deployment

- [ERNIE-4.5-0.3B-Paddle](ERNIE-4.5-0.3B-Paddle.md)
- [ERNIE-4.5-21B-A3B-Paddle](ERNIE-4.5-21B-A3B-Paddle.md)
- [ERNIE-4.5-300B-A47B-Paddle](ERNIE-4.5-300B-A47B-Paddle.md)
- [ERNIE-4.5-VL-28B-A3B-Paddle](ERNIE-4.5-VL-28B-A3B-Paddle.md)
- [ERNIE-4.5-VL-424B-A47B-Paddle](ERNIE-4.5-VL-424B-A47B-Paddle.md)
112
docs/features/graph_optimization.md
Normal file
@@ -0,0 +1,112 @@
# Graph optimization technology in FastDeploy

FastDeploy's `GraphOptimizationBackend` integrates a variety of graph optimization technologies:
+ **CUDA Graph**: a mechanism that launches multiple GPU operations with a single CPU operation, reducing launch overhead and improving performance

+ **Dynamic-to-static graph conversion**: converts dynamic graphs into static graphs and uses the global graph structure information to optimize the computation graph and improve execution efficiency

+ **CINN neural network compiler**: performs IR conversion, kernel fusion, kernel generation and other compilation optimizations on top of the static graph to achieve comprehensive optimization

Any dynamic behaviour, such as data-dependent control flow, host-device synchronization, changes to input addresses or shapes, or dynamic kernel launch configurations, will cause CUDAGraph capture/replay to fail. LLM inference involves dynamic input lengths, dynamic batch sizes, flexible attention implementations and multi-device communication, which makes CUDAGraph difficult to apply.

Mainstream open-source solutions implement CUDA Graph on top of static graphs, with a deep technology stack. FastDeploy not only supports the combined optimization of static graphs, the neural network compiler and CUDAGraph, but also supports applying CUDAGraph directly to dynamic graphs, which has lower development costs but faces more complex dynamic behaviour.

FastDeploy's `GraphOptimizationBackend` design architecture is shown below. **Some functions are still under development, so please read the usage restrictions in Chapter 1 carefully.**

![](./images/GraphOptBackendArch.svg)

## 1. GraphOptimizationBackend current usage restrictions

### 1.1 Multi-device scenarios require Custom all-reduce to be enabled
In a CUDAGraph multi-device inference task, the Custom all-reduce operator is needed to perform multi-GPU all-reduce.

Before version 2.2, neither CUDAGraph nor the Custom all-reduce operator was enabled by default. You need to add `--enable-custom-all-reduce` to the startup command to enable it manually.

### 1.2 Dynamic launch configuration of `FLAGS_max_partition_size`
The `FLAGS_max_partition_size` environment variable controls the `gridDim` launch configuration of the kernel in CascadeAppend Attention, and a dynamic launch configuration will cause CUDAGraph execution to fail.
[PR#3223](https://github.com/PaddlePaddle/FastDeploy/pull/3223) fixed this issue, but it still exists in release versions before 2.2.

**Problem self-checking method:**
+ Calculate `div_up(max_model_len, max_partition_size)` from the value of `FLAGS_max_partition_size` (default 32K) and the `max_model_len` startup parameter. The problem can occur when the result is greater than `1`; it runs normally when the result equals `1`. A small sketch of this check follows the solution list below.

**Solution:**
1. Adjust the values of `FLAGS_max_partition_size` and `max_model_len` so that a dynamic launch configuration is not triggered.
2. Disable CUDAGraph.
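A minimal sketch of the self-check described above (assuming the default `FLAGS_max_partition_size` of 32K when the variable is not set):

```python
import os

def div_up(a: int, b: int) -> int:
    return (a + b - 1) // b

max_partition_size = int(os.getenv("FLAGS_max_partition_size", 32 * 1024))
max_model_len = 131072  # take this from your startup parameters

if div_up(max_model_len, max_partition_size) > 1:
    print("Dynamic kernel launch config: CUDAGraph may fail on releases before 2.2.")
else:
    print("Launch config is static: CUDAGraph capture should work.")
```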

## 2. GraphOptimizationBackend related configuration parameters
Currently, only user configuration of the following parameters is supported:
+ `use_cudagraph` : bool = False
+ `graph_optimization_config` : Dict[str, Any]
  + `graph_opt_level`: int = 0
  + `use_cudagraph`: bool = False
  + `cudagraph_capture_sizes` : List[int] = None

CUDAGraph can be enabled by setting `--use-cudagraph` or `--graph-optimization-config '{"use_cudagraph":true}'`. Using both methods at the same time may cause conflicts.

The `graph_opt_level` parameter within `--graph-optimization-config` is used to configure the graph optimization level, with the following available options:
+ `0`: use the dynamic compute graph (default)
+ `1`: use the static compute graph; during the initialization phase, the Paddle API is used to convert the dynamic graph into a static graph
+ `2`: on top of the static compute graph, use Paddle's compiler (CINN, Compiler Infrastructure for Neural Networks) to compile and optimize

In general, static graphs have lower kernel launch overhead than dynamic graphs, and it is recommended to use static graphs.
For adapted models, FastDeploy's CUDAGraph *can support both dynamic and static graphs*.

When CUDAGraph is enabled with the default configuration, the list of batch sizes that CUDAGraph needs to capture is derived automatically from the `max_num_seqs` parameter. The logic for generating the list of batch sizes to capture is as follows:

1. Generate a candidate list covering batch sizes in the range [1, 1024].

```
# Batch Size [1, 2, 4, 8, 16, ... 120, 128]
candidate_capture_sizes = [1, 2, 4] + [8 * i for i in range(1, 17)]
# Batch Size (128, 144, ... 240, 256]
candidate_capture_sizes += [16 * i for i in range(9, 17)]
# Batch Size (256, 288, ... 992, 1024]
candidate_capture_sizes += [32 * i for i in range(17, 33)]
```

2. Crop the candidate list based on the user-set `max_num_seqs` to obtain the CUDAGraph capture list in the range [1, `max_num_seqs`]; a sketch of this step follows below.
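The cropping step can be pictured with the small sketch below (an approximation of the described behaviour, not the exact FastDeploy implementation):

```python
candidate_capture_sizes = [1, 2, 4] + [8 * i for i in range(1, 17)]
candidate_capture_sizes += [16 * i for i in range(9, 17)]
candidate_capture_sizes += [32 * i for i in range(17, 33)]

max_num_seqs = 128  # value of --max-num-seqs
cudagraph_capture_sizes = sorted(s for s in candidate_capture_sizes if s <= max_num_seqs)
print(cudagraph_capture_sizes)  # [1, 2, 4, 8, ..., 120, 128]
```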

Users can also customize the batch size list that CUDAGraph needs to capture through the `cudagraph_capture_sizes` parameter in `--graph-optimization-config`:

```
--graph-optimization-config '{"cudagraph_capture_sizes": [1, 3, 5, 7, 9]}'
```

### 2.1 CUDAGraph related parameters

Using CUDAGraph incurs some additional memory overhead, divided into two categories in FastDeploy:
+ Additional input buffer overhead
+ CUDAGraph uses a dedicated memory pool and therefore holds some intermediate activation memory that is isolated from the main framework

FastDeploy's initialization sequence first uses the `gpu_memory_utilization` parameter to calculate the memory available for `KVCache`; after `KVCache` is initialized, the remaining memory is used to initialize CUDAGraph. Since CUDAGraph is not enabled by default yet, using the default startup parameters may lead to `Out of memory` errors. The following solutions can be tried:
+ Lower the `gpu_memory_utilization` value to reserve more memory for CUDAGraph.
+ Lower `max_num_seqs` to decrease the maximum concurrency.
+ Customize the batch size list that CUDAGraph needs to capture through `graph_optimization_config`, and reduce the number of captured graphs via `cudagraph_capture_sizes`.

+ Before use, make sure the loaded model is properly decorated with `@support_graph_optimization`.

```python
# 1. import the decorator
from fastdeploy.model_executor.graph_optimization.decorator import support_graph_optimization
...

# 2. add the decorator
@support_graph_optimization
class Ernie4_5_Model(nn.Layer):  # Note: the decorator is added to the nn.Layer subclass
    ...

# 3. modify the parameter passing in the ModelForCasualLM subclass's self.model() call
class Ernie4_5_MoeForCausalLM(ModelForCasualLM):
    ...
    def forward(
        self,
        ids_remove_padding: paddle.Tensor,
        forward_meta: ForwardMeta,
    ):
        hidden_states = self.model(ids_remove_padding=ids_remove_padding,  # specify the parameter name when passing
                                   forward_meta=forward_meta)
        return hidden_states

# The same decorator applies to other nn.Layer subclasses, e.g.:
from fastdeploy.model_executor.graph_optimization.decorator import support_graph_optimization
...

@support_graph_optimization
class Ernie45TModel(nn.Layer):  # Note: the decorator is added to the nn.Layer subclass
    ...
```
1
docs/features/images/GraphOptBackendArch.svg
Normal file
File diff suppressed because one or more lines are too long
After Width: | Height: | Size: 30 KiB |
71
docs/features/multi-node_deployment.md
Normal file
@@ -0,0 +1,71 @@
# Multi-Node Deployment

## Overview
Multi-node deployment addresses scenarios where a single machine's GPU memory is insufficient to deploy a large model, by enabling tensor parallelism across multiple machines.

## Environment Preparation
#### Network Requirements
1. All nodes must be within the same local network
2. Ensure bidirectional connectivity between all nodes (test using `ping` and `nc -zv`; a small Python reachability check is sketched below)
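The reachability check can also be scripted. The sketch below assumes the master node is 192.168.1.101 and probes the example service and engine queue ports used later in this document (8180 and 8182) once they are open; adjust hosts and ports to your setup.

```python
import socket

def port_reachable(host: str, port: int, timeout: float = 3.0) -> bool:
    try:
        with socket.create_connection((host, port), timeout=timeout):
            return True
    except OSError:
        return False

for port in (8180, 8182):
    status = "reachable" if port_reachable("192.168.1.101", port) else "NOT reachable"
    print(f"192.168.1.101:{port} is {status}")
```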

#### Software Requirements
1. Install the same version of FastDeploy on all nodes
2. [Recommended] Install and configure MPI (OpenMPI or MPICH)

## Tensor Parallel Deployment

### Recommended Launch Method
We recommend using `mpirun` for one-command startup without manually starting each node.

### Usage Instructions
1. Execute the same command on all machines
2. The IP order in the `ips` parameter determines the node startup sequence
3. The first IP will be designated as the master node
4. Ensure all nodes can resolve each other's hostnames

* Online inference startup example:
```shell
python -m fastdeploy.entrypoints.openai.api_server \
    --model baidu/ERNIE-4.5-300B-A47B-Paddle \
    --port 8180 \
    --metrics-port 8181 \
    --engine-worker-queue-port 8182 \
    --max-model-len 32768 \
    --max-num-seqs 32 \
    --tensor-parallel-size 16 \
    --ips 192.168.1.101,192.168.1.102
```

* Offline startup example:
```python
from fastdeploy.engine.sampling_params import SamplingParams
from fastdeploy.entrypoints.llm import LLM

model_name_or_path = "baidu/ERNIE-4.5-300B-A47B-Paddle"

sampling_params = SamplingParams(temperature=0.1, max_tokens=30)
llm = LLM(model=model_name_or_path, tensor_parallel_size=16, ips="192.168.1.101,192.168.1.102")
if llm._check_master():
    output = llm.generate(prompts="Who are you?", use_tqdm=True, sampling_params=sampling_params)
    print(output)
```

* Notes:
  - Only the master node can receive completion requests
  - Always send requests to the master node (the first IP in the `ips` list); a client example follows below
  - The master node will distribute workloads across all nodes
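For example, an OpenAI-compatible client pointed at the master node of the online example above might look like this (a sketch; the port and model name must match your launch command):

```python
from openai import OpenAI

client = OpenAI(base_url="http://192.168.1.101:8180/v1", api_key="null")  # first IP in --ips

response = client.chat.completions.create(
    model="baidu/ERNIE-4.5-300B-A47B-Paddle",
    messages=[{"role": "user", "content": "Who are you?"}],
)
print(response.choices[0].message.content)
```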

### Parameter Description

#### `ips` Parameter
- **Type**: `string`
- **Format**: Comma-separated IPv4 addresses
- **Description**: Specifies the IP addresses of all nodes in the deployment group
- **Required**: Only for multi-node deployments
- **Example**: `"192.168.1.101,192.168.1.102,192.168.1.103"`

#### `tensor_parallel_size` Parameter
- **Type**: `integer`
- **Description**: Total number of GPUs across all nodes
- **Required**: Yes
- **Example**: For 2 nodes with 8 GPUs each, set to 16
99
docs/features/plugins.md
Normal file
@@ -0,0 +1,99 @@
# FastDeploy Plugin Mechanism Documentation

FastDeploy supports a plugin mechanism that allows users to extend functionality without modifying the core code. Plugins are automatically discovered and loaded through Python's `entry_points` mechanism.

## How Plugins Work

Plugins are essentially registration functions that are automatically called when FastDeploy starts. The system uses the `load_plugins_by_group` function to ensure that all processes (including child processes in distributed training scenarios) have loaded the required plugins before regular operation begins.

## Plugin Discovery Mechanism

FastDeploy uses Python's `entry_points` mechanism to discover and load plugins. Developers need to register their plugins in the specified entry point group in their project; the sketch below illustrates the underlying mechanism.
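Conceptually, the discovery step behaves like the standard-library sketch below (an illustration only; the actual loader is `load_plugins_by_group` inside FastDeploy):

```python
from importlib.metadata import entry_points

# Iterate over every plugin registered under a FastDeploy plugin group.
for ep in entry_points(group="fastdeploy.model_register_plugins"):
    register_fn = ep.load()   # resolves the "module_name:function_name" entry point value
    register_fn()             # runs the plugin's registration logic
    print(f"Loaded plugin {ep.name} from {ep.value}")
```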
|
||||
|
||||
### Example: Creating a Plugin
|
||||
|
||||
#### 1. Write the Registration Function
|
||||
|
||||
Assuming you have a custom model class `MyModelForCasualLM` and a pretrained class `MyPretrainedModel`, you can write the following registration function:
|
||||
|
||||
```python
# File: fd_add_dummy_model/__init__.py or fd_add_dummy_model/register.py
from fastdeploy.model_registry import ModelRegistry
from my_custom_model import MyModelForCasualLM, MyPretrainedModel
from fastdeploy.config import ErnieArchitectures


def register():
    if "MyModelForCasualLM" not in ModelRegistry.get_supported_archs():
        if MyModelForCasualLM.name().startswith("Ernie"):
            ErnieArchitectures.register_ernie_model_arch(MyModelForCasualLM)
        ModelRegistry.register_model_class(MyModelForCasualLM)
        ModelRegistry.register_pretrained_model(MyPretrainedModel)
```
|
||||
Assuming you have a custom model_runner class `MyModelRunner`, you can write the following registration function:
|
||||
```python
|
||||
# File: fd_add_dummy_model_runner/__init__.py
|
||||
from .my_model_runner import MyModelRunner
|
||||
|
||||
def get_runner():
|
||||
return MyModelRunner
|
||||
```
|
||||
|
||||
#### 2. Register Plugin in `setup.py`
|
||||
|
||||
```python
|
||||
# setup.py
|
||||
from setuptools import setup
|
||||
|
||||
setup(
|
||||
name="fastdeploy-plugins",
|
||||
version="0.1",
|
||||
packages=["fd_add_dummy_model", "fd_add_dummy_model_runner"],
|
||||
entry_points={
|
||||
"fastdeploy.model_register_plugins": [
|
||||
"fd_add_dummy_model = fd_add_dummy_model:register",
|
||||
],
|
||||
"fastdeploy.model_runner_plugins": [
|
||||
"model_runner = fd_add_dummy_model:get_runner"
|
||||
],
|
||||
},
|
||||
)
|
||||
```
|
||||
|
||||
## Plugin Structure
|
||||
|
||||
Plugins consist of three components:
|
||||
|
||||
| Component | Description |
|
||||
|-----------|-------------|
|
||||
| **Plugin Group** | The functional group to which the plugin belongs, for example:<br> - `fastdeploy.model_register_plugins`: for model registration<br> - `fastdeploy.model_runner_plugins`: for model runner registration<br> Users can customize groups as needed. |
|
||||
| **Plugin Name** | The unique identifier for each plugin (e.g., `fd_add_dummy_model`), which can be controlled via the `FD_PLUGINS` environment variable to determine whether to load the plugin. |
|
||||
| **Plugin Value** | Format is `module_name:function_name`, pointing to the entry function that executes the registration logic. |
|
||||
|
||||
## Controlling Plugin Loading Behavior
|
||||
|
||||
By default, FastDeploy loads all registered plugins. To load only specific plugins, you can set the environment variable:
|
||||
|
||||
```bash
|
||||
export FD_PLUGINS=fastdeploy-plugins
|
||||
```
|
||||
|
||||
Multiple plugin names can be separated by commas:
|
||||
|
||||
```bash
|
||||
export FD_PLUGINS=plugin_a,plugin_b
|
||||
```
|
||||
|
||||
## Reference Example
|
||||
|
||||
Please refer to the example plugin implementation in the project directory:
|
||||
```
|
||||
./test/plugins/
|
||||
```
|
||||
|
||||
It contains a complete plugin structure and `setup.py` configuration example.
|
||||
|
||||
## Summary
|
||||
|
||||
Through the plugin mechanism, users can easily add custom models or functional modules to FastDeploy without modifying the core source code. This not only enhances system extensibility but also facilitates third-party developers in extending functionality.
|
||||
|
||||
For further plugin development, please refer to the `model_registry` and `plugin_loader` modules in the FastDeploy source code.
|
@@ -98,7 +98,7 @@ curl -X POST "http://0.0.0.0:9222/v1/chat/completions" \
|
||||
{"role": "user", "content": "How old are you"}
|
||||
],
|
||||
"top_p": 0.8,
|
||||
"top_k": 50
|
||||
"top_k": 20
|
||||
}'
|
||||
```
|
||||
|
||||
@@ -117,7 +117,7 @@ response = client.chat.completions.create(
|
||||
],
|
||||
stream=True,
|
||||
top_p=0.8,
|
||||
top_k=50
|
||||
extra_body={"top_k": 20, "min_p":0.1}
|
||||
)
|
||||
for chunk in response:
|
||||
if chunk.choices[0].delta:
|
||||
@@ -159,8 +159,7 @@ response = client.chat.completions.create(
|
||||
],
|
||||
stream=True,
|
||||
top_p=0.8,
|
||||
top_k=20,
|
||||
min_p=0.1
|
||||
extra_body={"top_k": 20, "min_p":0.1}
|
||||
)
|
||||
for chunk in response:
|
||||
if chunk.choices[0].delta:
|
||||
|
@@ -23,6 +23,7 @@ Execute the following command to start the service. For parameter configurations
|
||||
>💡 **Note**: Since the model parameter size is 424B-A47B, on an 80G * 8 GPU machine, specify ```--quantization wint4``` (wint8 is also supported).
|
||||
|
||||
```shell
|
||||
export ENABLE_V1_KVCACHE_SCHEDULER=1
|
||||
python -m fastdeploy.entrypoints.openai.api_server \
|
||||
--model baidu/ERNIE-4.5-VL-424B-A47B-Paddle \
|
||||
--port 8180 --engine-worker-queue-port 8181 \
|
||||
@@ -31,7 +32,6 @@ python -m fastdeploy.entrypoints.openai.api_server \
|
||||
--quantization wint4 \
|
||||
--max-model-len 32768 \
|
||||
--max-num-seqs 32 \
|
||||
--enable-mm \
|
||||
--mm-processor-kwargs '{"video_max_frames": 30}' \
|
||||
--limit-mm-per-prompt '{"image": 10, "video": 3}' \
|
||||
--reasoning-parser ernie-45-vl
|
||||
|
@@ -21,6 +21,7 @@ Specify `--model baidu/ERNIE-4.5-300B-A47B-Paddle` during deployment to automati
|
||||
Execute the following command to start the service. For configuration details, refer to the [Parameter Guide](../parameters.md):
|
||||
|
||||
```shell
|
||||
export ENABLE_V1_KVCACHE_SCHEDULER=1
|
||||
python -m fastdeploy.entrypoints.openai.api_server \
|
||||
--model baidu/ERNIE-4.5-300B-A47B-Paddle \
|
||||
--port 8180 --engine-worker-queue-port 8181 \
|
||||
|
@@ -1,12 +1,12 @@
|
||||
# Run ERNIE-4.5-300B-A47B & ERNIE-4.5-21B-A3B model on iluvatar machine
|
||||
The current version of the software merely serves as a demonstration demo for the Iluvatar CoreX combined with the Fastdeploy inference framework for large models. There may be issues when running the latest ERNIE4.5 model, and we will conduct repairs and performance optimization in the future. Subsequent versions will provide customers with a more stable version.
|
||||
The current version of the software merely serves as a demonstration demo for the Iluvatar CoreX combined with the Fastdeploy inference framework for large models. Running the latest ERNIE4.5 300B model on the GSM8K dataset takes about 6.3 hours.
|
||||
|
||||
## Machine Preparation
|
||||
First, you need to prepare a machine with the following configurations:
|
||||
First, running the ERNIE4.5 300B model requires `TP=16`, so you need to prepare a machine with the following configuration:
|
||||
|
||||
| CPU | Memory | Card | Hard Disk|
|
||||
| :---: | :---: | :---: | :---: |
|
||||
| x86 | 1TB| 8xBI150| 1TB|
|
||||
| x86 | 1TB| 16xBI150| 1TB|
|
||||
|
||||
Currently, the entire model needs to be loaded into the host memory, which requires more than 600GB of host memory. This issue will be optimized in subsequent versions.
|
||||
|
||||
@@ -32,7 +32,7 @@ docker exec -it paddle_infer bash
|
||||
```bash
|
||||
pip3 install paddlepaddle==3.1.0a0 -i https://www.paddlepaddle.org.cn/packages/stable/cpu/
|
||||
pip3 install paddle-iluvatar-gpu==3.1.0 -i https://www.paddlepaddle.org.cn/packages/stable/ixuca/
|
||||
pip3 install fastdeploy_iluvatar_gpu -i https://www.paddlepaddle.org.cn/packages/stable/ixuca/ --extra-index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simplels
|
||||
pip3 install fastdeploy_iluvatar_gpu==2.1.0.dev0 -i https://www.paddlepaddle.org.cn/packages/stable/ixuca/ --extra-index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simplels
|
||||
```
|
||||
|
||||
## Prepare the inference demo script
|
||||
@@ -46,6 +46,7 @@ script list below:
|
||||
export PADDLE_XCCL_BACKEND=iluvatar_gpu
|
||||
export INFERENCE_MSG_QUEUE_ID=232132
|
||||
export LD_PRELOAD=/usr/local/corex/lib64/libcuda.so.1
|
||||
export FD_SAMPLING_CLASS=rejection
|
||||
export FD_DEBUG=1
|
||||
python3 run_demo.py
|
||||
```
|
||||
@@ -64,7 +65,7 @@ prompts = [
|
||||
sampling_params = SamplingParams(temperature=0.8, top_p=0.95, max_tokens=256)
|
||||
|
||||
# load the model
|
||||
llm = LLM(model="/home/paddle/ernie-4_5-21b-a3b-bf16-paddle", tensor_parallel_size=4, max_model_len=8192, static_decode_blocks=0, quantization='wint8')
|
||||
llm = LLM(model="/home/paddle/ernie-4_5-21b-a3b-bf16-paddle", tensor_parallel_size=4, max_model_len=8192, static_decode_blocks=0, block_size=16, quantization='wint8')
|
||||
|
||||
# Perform batch inference
|
||||
outputs = llm.generate(prompts, sampling_params)
|
||||
@@ -118,3 +119,281 @@ Now, let's break down each step:
|
||||
**Step 3: Drawing the
|
||||
The largest ocean is the Pacific Ocean, covering an area of approximately … [3], The first scientific expeditions to determine the ocean's depth were the Challenger expedition (1872–1876) and the U.S. Navy Hydrographic Office survey (1877–1879). The oceanic crust is thin and irregular, consisting of upward moving magma from the mantle below, and cooling and solidifying on the surface. The shallowest parts of the ocean are called the continental shelves. Large tides are caused mainly by the alignment of the Sun, Moon, and Earth during new or full moons. The origin of the word "ocean" is not clear. The first global oceanic topography survey was completed by the Challenger expedition (1872–1876). [57] The sound speed in the ocean is primarily a function of water temperature and salinity, and varies with depth. The deep-ocean floor is mostly flat and devoid of life, with the exception of seamounts and various underwater volcanic features, including seamounts and hydrothermal vents. [73] Today, the five ocean
|
||||
```
|
||||
|
||||
## Run ernie4.5 300B model with the GSM8K dataset
|
||||
|
||||
1. Download GSM8K dataset
|
||||
|
||||
```bash
|
||||
wget https://raw.githubusercontent.com/openai/grade-school-math/master/grade_school_math/data/test.jsonl
|
||||
```
|
||||
|
||||
2. Prepare `bench_gsm8k.py`
|
||||
|
||||
```python
|
||||
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
""" Fastdeploy + ERNIE-4.5-Turbo 的指标评估 """
|
||||
# adapted from https://github.com/sgl-project/sglang/blob/main/benchmark/gsm8k/bench_other.py
|
||||
import argparse
|
||||
import ast
|
||||
import json
|
||||
import re
|
||||
import time
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
|
||||
import numpy as np
|
||||
import requests
|
||||
from tqdm import tqdm
|
||||
|
||||
INVALID = -9999999
|
||||
|
||||
|
||||
def call_generate(prompt, **kwargs):
|
||||
"""
|
||||
Generates response based on the input prompt.
|
||||
|
||||
Args:
|
||||
prompt (str): The input prompt text.
|
||||
**kwargs: Keyword arguments, including server IP address and port number.
|
||||
|
||||
Returns:
|
||||
str: The response generated based on the prompt.
|
||||
|
||||
"""
|
||||
url = f"http://{kwargs['ip']}:{kwargs['port']}/v1/chat/completions"
|
||||
headers = {"Content-Type": "application/json"}
|
||||
data = {
|
||||
"messages": [
|
||||
{
|
||||
"role": "user",
|
||||
"content": prompt,
|
||||
}
|
||||
],
|
||||
"temperature": 0.6,
|
||||
"max_tokens": 2047,
|
||||
"top_p": 0.95,
|
||||
"do_sample": True,
|
||||
}
|
||||
|
||||
response = requests.post(url, headers=headers, data=json.dumps(data))
|
||||
out = response.json()
|
||||
return out["choices"][0]["message"]["content"]
|
||||
|
||||
|
||||
def get_one_example(lines, i, include_answer):
|
||||
"""
|
||||
Retrieves a question-answer example from the given list of text lines.
|
||||
|
||||
Args:
|
||||
lines (list of dict): A list of question-answer pairs.
|
||||
i (int): The index of the question-answer pair to retrieve from lines.
|
||||
include_answer (bool): Whether to include the answer in the returned string.
|
||||
|
||||
Returns:
|
||||
str: A formatted question-answer string in the format "Question: <question>\nAnswer: <answer>".
|
||||
|
||||
"""
|
||||
ret = "Question: " + lines[i]["question"] + "\nAnswer:"
|
||||
if include_answer:
|
||||
ret += " " + lines[i]["answer"]
|
||||
return ret
|
||||
|
||||
|
||||
def get_few_shot_examples(lines, k):
|
||||
"""
|
||||
Selects k examples from the given list of text lines and concatenates them into a single string.
|
||||
|
||||
Args:
|
||||
lines (list): A list containing text lines.
|
||||
k (int): The number of examples to select.
|
||||
|
||||
Returns:
|
||||
str: A string composed of k examples, separated by two newline characters.
|
||||
"""
|
||||
ret = ""
|
||||
for i in range(k):
|
||||
ret += get_one_example(lines, i, True) + "\n\n"
|
||||
return ret
|
||||
|
||||
|
||||
def get_answer_value(answer_str):
|
||||
"""
|
||||
Extracts numerical values from an answer string and returns them.
|
||||
|
||||
Args:
|
||||
answer_str (str): The string containing the answer.
|
||||
|
||||
Returns:
|
||||
The extracted numerical value; returns "INVALID" if extraction fails.
|
||||
"""
|
||||
answer_str = answer_str.replace(",", "")
|
||||
numbers = re.findall(r"\d+", answer_str)
|
||||
if len(numbers) < 1:
|
||||
return INVALID
|
||||
try:
|
||||
return ast.literal_eval(numbers[-1])
|
||||
except SyntaxError:
|
||||
return INVALID
|
||||
|
||||
|
||||
def read_jsonl(filename: str):
|
||||
"""
|
||||
Reads a JSONL file.
|
||||
|
||||
Args:
|
||||
filename (str): Path to the JSONL file.
|
||||
|
||||
Yields:
|
||||
dict: A dictionary object corresponding to each line in the JSONL file.
|
||||
"""
|
||||
with open(filename) as fin:
|
||||
for line in fin:
|
||||
if line.startswith("#"):
|
||||
continue
|
||||
yield json.loads(line)
|
||||
|
||||
|
||||
def main(args):
|
||||
"""
|
||||
Process inputs and generate answers by calling the model in parallel using a thread pool.
|
||||
|
||||
Args:
|
||||
args (argparse.Namespace):
|
||||
- num_questions (int): Number of questions to process.
|
||||
- num_shots (int): Number of few-shot learning examples.
|
||||
- ip (str): IP address of the model service.
|
||||
- port (int): Port number of the model service.
|
||||
- parallel (int): Number of questions to process in parallel.
|
||||
- result_file (str): File path to store the results.
|
||||
|
||||
Returns:
|
||||
None
|
||||
|
||||
"""
|
||||
# Read data
|
||||
filename = "test.jsonl"
|
||||
|
||||
lines = list(read_jsonl(filename))
|
||||
|
||||
# Construct prompts
|
||||
num_questions = args.num_questions
|
||||
num_shots = args.num_shots
|
||||
few_shot_examples = get_few_shot_examples(lines, num_shots)
|
||||
|
||||
questions = []
|
||||
labels = []
|
||||
for i in range(len(lines[:num_questions])):
|
||||
questions.append(get_one_example(lines, i, False))
|
||||
labels.append(get_answer_value(lines[i]["answer"]))
|
||||
assert all(l != INVALID for l in labels)
|
||||
|
||||
states = [None] * len(labels)
|
||||
|
||||
# Use thread pool
|
||||
def get_one_answer(i):
|
||||
answer = call_generate(
|
||||
prompt=few_shot_examples + questions[i],
|
||||
# stop=["Question", "Assistant:", "<|separator|>"],
|
||||
ip=args.ip,
|
||||
port=args.port,
|
||||
)
|
||||
states[i] = answer
|
||||
|
||||
tic = time.time()
|
||||
if args.parallel == 1:
|
||||
for i in tqdm(range(len(questions))):
|
||||
get_one_answer(i)
|
||||
else:
|
||||
with ThreadPoolExecutor(args.parallel) as executor:
|
||||
list(
|
||||
tqdm(
|
||||
executor.map(get_one_answer, list(range(len(questions)))),
|
||||
total=len(questions),
|
||||
)
|
||||
)
|
||||
|
||||
latency = time.time() - tic
|
||||
preds = []
|
||||
for i in range(len(states)):
|
||||
preds.append(get_answer_value(states[i]))
|
||||
|
||||
# Compute accuracy
|
||||
acc = np.mean(np.array(preds) == np.array(labels))
|
||||
invalid = np.mean(np.array(preds) == INVALID)
|
||||
|
||||
# Print results
|
||||
print(f"Accuracy: {acc:.3f}")
|
||||
print(f"Invalid: {invalid:.3f}")
|
||||
print(f"Latency: {latency:.3f} s")
|
||||
|
||||
with open(args.result_file, "a") as fout:
|
||||
value = {
|
||||
"task": "gsm8k",
|
||||
"backend": "paddlepaddle",
|
||||
"num_gpus": 1,
|
||||
"latency": round(latency, 3),
|
||||
"accuracy": round(acc, 3),
|
||||
"num_requests": args.num_questions,
|
||||
"other": {
|
||||
"num_questions": args.num_questions,
|
||||
"parallel": args.parallel,
|
||||
},
|
||||
}
|
||||
fout.write(json.dumps(value) + "\n")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument("--ip", type=str, default="127.0.0.1")
|
||||
parser.add_argument("--port", type=str, default="8188")
|
||||
parser.add_argument("--num-shots", type=int, default=10)
|
||||
parser.add_argument("--data-path", type=str, default="test.jsonl")
|
||||
parser.add_argument("--num-questions", type=int, default=1319)
|
||||
parser.add_argument("--result-file", type=str, default="result.jsonl")
|
||||
parser.add_argument("--parallel", type=int, default=1)
|
||||
args = parser.parse_args()
|
||||
main(args)
|
||||
```
|
||||
|
||||
3. Prepare `run_bench.sh`
|
||||
|
||||
```bash
|
||||
#!/bin/bash
|
||||
export PADDLE_XCCL_BACKEND=iluvatar_gpu
|
||||
export INFERENCE_MSG_QUEUE_ID=232132
|
||||
export LD_PRELOAD=/usr/local/corex/lib64/libcuda.so.1
|
||||
export FD_SAMPLING_CLASS=rejection
|
||||
|
||||
python3 -m fastdeploy.entrypoints.openai.api_server --model "/home/paddle/ernie-45t" --port 8188 --tensor-parallel-size 16 --block-size 16 --static-decode-blocks 0 --quantization wint8
|
||||
```
|
||||
|
||||
4. Running the Script
|
||||
|
||||
Firstly, open a terminal and run:
|
||||
```bash
|
||||
./run_bench.sh
|
||||
```
|
||||
After the service is ready, open another terminal and run:
|
||||
```bash
|
||||
python3 -u bench_gsm8k.py --port 8188 --num-questions 1319 --num-shots 5 --parallel 8
|
||||
```
|
||||
It takes about 6.3 hours to run the GSM8K dataset.
|
||||
|
||||
```
|
||||
Accuracy: 0.964
|
||||
Invalid: 0.000
|
||||
Latency: 22918.186 s
|
||||
```
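Each run also appends a JSON record to `result.jsonl`; a small sketch for summarizing those records:

```python
import json

with open("result.jsonl") as fin:
    runs = [json.loads(line) for line in fin if line.strip()]

for run in runs:
    throughput = run["num_requests"] / run["latency"]  # requests per second
    print(f"accuracy={run['accuracy']:.3f}  latency={run['latency']:.1f}s  throughput={throughput:.2f} req/s")
```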
|
||||
|
@@ -5,8 +5,8 @@
|
||||
- OS: Linux
|
||||
- Python: 3.10
|
||||
- XPU Model: P800
|
||||
- XPU Driver Version: ≥ 5.0.21.10
|
||||
- XPU Firmware Version: ≥ 1.31
|
||||
- XPU Driver Version: ≥ 5.0.21.26
|
||||
- XPU Firmware Version: ≥ 1.48
|
||||
|
||||
Verified platform:
|
||||
- CPU: INTEL(R) XEON(R) PLATINUM 8563C / Hygon C86-4G 7490 64-core Processor
|
||||
@@ -15,8 +15,8 @@ Verified platform:
|
||||
- OS: CentOS release 7.6 (Final)
|
||||
- Python: 3.10
|
||||
- XPU Model: P800 (OAM Edition)
|
||||
- XPU Driver Version: 5.0.21.10
|
||||
- XPU Firmware Version: 1.31
|
||||
- XPU Driver Version: 5.0.21.26
|
||||
- XPU Firmware Version: 1.48
|
||||
|
||||
**Note:** Currently, only INTEL or Hygon CPU-based P800 (OAM Edition) servers have been verified. Other CPU types and P800 (PCIe Edition) servers have not been tested yet.
|
||||
|
||||
@@ -25,9 +25,9 @@ Verified platform:
|
||||
```bash
|
||||
mkdir Work
|
||||
cd Work
|
||||
docker pull ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/fastdeploy-xpu:2.0.3
|
||||
docker pull ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/fastdeploy-xpu:2.1.0
|
||||
docker run --name fastdeploy-xpu --net=host -itd --privileged -v $PWD:/Work -w /Work \
|
||||
ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/fastdeploy-xpu:2.0.3 \
|
||||
ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/fastdeploy-xpu:2.1.0 \
|
||||
/bin/bash
|
||||
docker exec -it fastdeploy-xpu /bin/bash
|
||||
```
|
||||
@@ -37,7 +37,7 @@ docker exec -it fastdeploy-xpu /bin/bash
|
||||
### Install PaddlePaddle
|
||||
|
||||
```bash
|
||||
python -m pip install paddlepaddle-xpu==3.1.0 -i https://www.paddlepaddle.org.cn/packages/stable/xpu-p800/
|
||||
python -m pip install paddlepaddle-xpu==3.1.1 -i https://www.paddlepaddle.org.cn/packages/stable/xpu-p800/
|
||||
```
|
||||
|
||||
Alternatively, you can install the latest version of PaddlePaddle (Not recommended)
|
||||
@@ -49,7 +49,7 @@ python -m pip install --pre paddlepaddle-xpu -i https://www.paddlepaddle.org.cn/
|
||||
### Install FastDeploy (**Do NOT install via PyPI source**)
|
||||
|
||||
```bash
|
||||
python -m pip install fastdeploy-xpu==2.0.3 -i https://www.paddlepaddle.org.cn/packages/stable/fastdeploy-xpu-p800/ --extra-index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
|
||||
python -m pip install fastdeploy-xpu==2.1.0 -i https://www.paddlepaddle.org.cn/packages/stable/fastdeploy-xpu-p800/ --extra-index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
|
||||
```
|
||||
|
||||
Alternatively, you can install the latest version of FastDeploy (Not recommended)
|
||||
@@ -63,7 +63,7 @@ python -m pip install --pre fastdeploy-xpu -i https://www.paddlepaddle.org.cn/pa
|
||||
### Install PaddlePaddle
|
||||
|
||||
```bash
|
||||
python -m pip install paddlepaddle-xpu==3.1.0 -i https://www.paddlepaddle.org.cn/packages/stable/xpu-p800/
|
||||
python -m pip install paddlepaddle-xpu==3.1.1 -i https://www.paddlepaddle.org.cn/packages/stable/xpu-p800/
|
||||
```
|
||||
|
||||
Alternatively, you can install the latest version of PaddlePaddle (Not recommended)
|
||||
|
@@ -13,14 +13,14 @@ The following installation methods are available when your environment meets the
|
||||
**Notice**: The pre-built image only supports SM80/90 GPUs (e.g. H800/A800). If you are deploying on SM86/89 GPUs (L40/4090/L20), please reinstall ```fastdeploy-gpu``` after you create the container.
|
||||
|
||||
```shell
|
||||
docker pull ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/fastdeploy-cuda-12.6:2.0.0
|
||||
docker pull ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/fastdeploy-cuda-12.6:2.1.0
|
||||
```
|
||||
|
||||
## 2. Pre-built Pip Installation
|
||||
|
||||
First install paddlepaddle-gpu. For detailed instructions, refer to [PaddlePaddle Installation](https://www.paddlepaddle.org.cn/en/install/quick?docurl=/documentation/docs/en/develop/install/pip/linux-pip_en.html)
|
||||
```shell
|
||||
python -m pip install paddlepaddle-gpu==3.1.0 -i https://www.paddlepaddle.org.cn/packages/stable/cu126/
|
||||
python -m pip install paddlepaddle-gpu==3.1.1 -i https://www.paddlepaddle.org.cn/packages/stable/cu126/
|
||||
```
|
||||
|
||||
Then install fastdeploy. **Do not install from PyPI**. Use the following methods instead:
|
||||
@@ -58,7 +58,7 @@ docker build -f dockerfiles/Dockerfile.gpu -t fastdeploy:gpu .
|
||||
|
||||
First install paddlepaddle-gpu. For detailed instructions, refer to [PaddlePaddle Installation](https://www.paddlepaddle.org.cn/en/install/quick?docurl=/documentation/docs/en/develop/install/pip/linux-pip_en.html)
|
||||
```shell
|
||||
python -m pip install paddlepaddle-gpu==3.1.0 -i https://www.paddlepaddle.org.cn/packages/stable/cu126/
|
||||
python -m pip install paddlepaddle-gpu==3.1.1 -i https://www.paddlepaddle.org.cn/packages/stable/cu126/
|
||||
```
|
||||
|
||||
Then clone the source code and build:
|
||||
|
@@ -16,6 +16,7 @@ For more information about how to install FastDeploy, refer to the [installation
|
||||
After installing FastDeploy, execute the following command in the terminal to start the service. For the configuration method of the startup command, refer to [Parameter Description](../parameters.md)
|
||||
|
||||
```
|
||||
export ENABLE_V1_KVCACHE_SCHEDULER=1
|
||||
python -m fastdeploy.entrypoints.openai.api_server \
|
||||
--model baidu/ERNIE-4.5-0.3B-Paddle \
|
||||
--port 8180 \
|
||||
|
@@ -19,6 +19,7 @@ For more information about how to install FastDeploy, refer to the [installation
|
||||
After installing FastDeploy, execute the following command in the terminal to start the service. For the configuration method of the startup command, refer to [Parameter Description](../parameters.md)
|
||||
|
||||
```shell
|
||||
export ENABLE_V1_KVCACHE_SCHEDULER=1
|
||||
python -m fastdeploy.entrypoints.openai.api_server \
|
||||
--model baidu/ERNIE-4.5-VL-28B-A3B-Paddle \
|
||||
--port 8180 \
|
||||
@@ -26,8 +27,7 @@ python -m fastdeploy.entrypoints.openai.api_server \
|
||||
--engine-worker-queue-port 8182 \
|
||||
--max-model-len 32768 \
|
||||
--max-num-seqs 32 \
|
||||
--reasoning-parser ernie-45-vl \
|
||||
--enable-mm
|
||||
--reasoning-parser ernie-45-vl
|
||||
```
|
||||
|
||||
> 💡 Note: In the path specified by ```--model```, if the subdirectory corresponding to the path does not exist in the current directory, it will try to query whether AIStudio has a preset model based on the specified model name (such as ```baidu/ERNIE-4.5-0.3B-Base-Paddle```). If it exists, it will automatically start downloading. The default download path is: ```~/xx```. For instructions and configuration on automatic model download, see [Model Download](../supported_models.md).
|
||||
|
@@ -13,12 +13,12 @@
|
||||
|
||||
| Model | Data Type | PD Disaggregation | Chunked Prefill | Prefix Caching | MTP | CUDA Graph | Maximum Context Length |
|
||||
|:--- | :------- | :---------- | :-------- | :-------- | :----- | :----- | :----- |
|
||||
|ERNIE-4.5-300B-A47B | BF16/WINT4/WINT8/W4A8C8/WINT2/FP8 | ✅| ✅ | ✅|✅(WINT4)| WIP |128K |
|
||||
|ERNIE-4.5-300B-A47B-Base| BF16/WINT4/WINT8 | ✅| ✅ | ✅|✅(WINT4)| WIP | 128K |
|
||||
|ERNIE-4.5-300B-A47B | BF16/WINT4/WINT8/W4A8C8/WINT2/FP8 | ✅| ✅ | ✅|✅| WIP |128K |
|
||||
|ERNIE-4.5-300B-A47B-Base| BF16/WINT4/WINT8 | ✅| ✅ | ✅|❌| WIP | 128K |
|
||||
|ERNIE-4.5-VL-424B-A47B | BF16/WINT4/WINT8 | WIP | ✅ | WIP | ❌ | WIP |128K |
|
||||
|ERNIE-4.5-VL-28B-A3B | BF16/WINT4/WINT8 | ❌ | ✅ | WIP | ❌ | WIP |128K |
|
||||
|ERNIE-4.5-21B-A3B | BF16/WINT4/WINT8/FP8 | ❌ | ✅ | ✅ | WIP | ✅|128K |
|
||||
|ERNIE-4.5-21B-A3B-Base | BF16/WINT4/WINT8/FP8 | ❌ | ✅ | ✅ | WIP | ✅|128K |
|
||||
|ERNIE-4.5-21B-A3B | BF16/WINT4/WINT8/FP8 | ❌ | ✅ | ✅ | ✅ | ✅|128K |
|
||||
|ERNIE-4.5-21B-A3B-Base | BF16/WINT4/WINT8/FP8 | ❌ | ✅ | ✅ | ❌ | ✅|128K |
|
||||
|ERNIE-4.5-0.3B | BF16/WINT8/FP8 | ❌ | ✅ | ✅ | ❌ | ✅| 128K |
|
||||
|
||||
## Documentation
|
||||
|
@@ -39,7 +39,7 @@ Documentation for `SamplingParams`, `LLM.generate`, `LLM.chat`, and output struc
|
||||
```python
|
||||
from fastdeploy.entrypoints.llm import LLM
|
||||
# 加载模型
|
||||
llm = LLM(model="baidu/ERNIE-4.5-VL-28B-A3B-Paddle", tensor_parallel_size=1, max_model_len=32768, enable_mm=True, limit_mm_per_prompt={"image": 100}, reasoning_parser="ernie-45-vl")
|
||||
llm = LLM(model="baidu/ERNIE-4.5-VL-28B-A3B-Paddle", tensor_parallel_size=1, max_model_len=32768, limit_mm_per_prompt={"image": 100}, reasoning_parser="ernie-45-vl")
|
||||
|
||||
outputs = llm.chat(
|
||||
messages=[
|
||||
@@ -127,7 +127,7 @@ for message in messages:
|
||||
})
|
||||
|
||||
sampling_params = SamplingParams(temperature=0.1, max_tokens=6400)
|
||||
llm = LLM(model=PATH, tensor_parallel_size=1, max_model_len=32768, enable_mm=True, limit_mm_per_prompt={"image": 100}, reasoning_parser="ernie-45-vl")
|
||||
llm = LLM(model=PATH, tensor_parallel_size=1, max_model_len=32768, limit_mm_per_prompt={"image": 100}, reasoning_parser="ernie-45-vl")
|
||||
outputs = llm.generate(prompts={
|
||||
"prompt": prompt,
|
||||
"multimodal_data": {
|
||||
@@ -183,6 +183,7 @@ For ```LLM``` configuration, refer to [Parameter Documentation](parameters.md).
|
||||
* min_p(float): Minimum probability relative to the maximum probability for a token to be considered (>0 filters low-probability tokens to improve quality)
|
||||
* max_tokens(int): Maximum generated tokens (input + output)
|
||||
* min_tokens(int): Minimum forced generation length
|
||||
* bad_words(list[str]): Prohibited words
|
||||
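A short sketch pulling the sampling fields above together (not from the original docs; the import path follows the offline examples elsewhere in this documentation, and the concrete values are illustrative only):

```python
from fastdeploy.engine.sampling_params import SamplingParams

sampling_params = SamplingParams(
    temperature=0.8,
    min_p=0.05,          # drop tokens whose probability is below 5% of the top token's
    max_tokens=1024,     # upper bound on generated tokens
    min_tokens=1,        # force at least one token before stopping
    bad_words=["foo"],   # hypothetical prohibited word
)
```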
|
||||
### 2.5 fastdeploy.engine.request.RequestOutput
|
||||
|
||||
|
@@ -8,6 +8,8 @@ When using FastDeploy to deploy models (including offline inference and service
|
||||
|:--------------|:----|:-----------|
|
||||
| ```port``` | `int` | Only required for service deployment, HTTP service port number, default: 8000 |
|
||||
| ```metrics_port``` | `int` | Only required for service deployment, metrics monitoring port number, default: 8001 |
|
||||
| ```max_waiting_time``` | `int` | Only required for service deployment, maximum wait time for establishing a connection upon service request. Default: -1 (indicates no wait time limit).|
|
||||
| ```max_concurrency``` | `int` | Only required for service deployment, the actual number of connections established by the service, default 512 |
|
||||
| ```engine_worker_queue_port``` | `int` | FastDeploy internal engine communication port, default: 8002 |
|
||||
| ```cache_queue_port``` | `int` | FastDeploy internal KVCache process communication port, default: 8003 |
|
||||
| ```max_model_len``` | `int` | Default maximum supported context length for inference, default: 2048 |
|
||||
@@ -19,7 +21,7 @@ When using FastDeploy to deploy models (including offline inference and service
|
||||
| ```tokenizer``` | `str` | Tokenizer name or path, defaults to model path |
|
||||
| ```use_warmup``` | `int` | Whether to perform warmup at startup, will automatically generate maximum length data for warmup, enabled by default when automatically calculating KV Cache |
|
||||
| ```limit_mm_per_prompt``` | `dict[str]` | Limit the amount of multimodal data per prompt, e.g.: {"image": 10, "video": 3}, default: 1 for all |
|
||||
| ```enable_mm``` | `bool` | Whether to support multimodal data (for multimodal models only), default: False |
|
||||
| ```enable_mm``` | `bool` | __[DEPRECATED]__ Whether to support multimodal data (for multimodal models only), default: False |
|
||||
| ```quantization``` | `str` | Model quantization strategy, when loading BF16 CKPT, specifying wint4 or wint8 supports lossless online 4bit/8bit quantization |
|
||||
| ```gpu_memory_utilization``` | `float` | GPU memory utilization, default: 0.9 |
|
||||
| ```num_gpu_blocks_override``` | `int` | Preallocated KVCache blocks, this parameter can be automatically calculated by FastDeploy based on memory situation, no need for user configuration, default: None |
|
||||
@@ -33,8 +35,8 @@ When using FastDeploy to deploy models (including offline inference and service
|
||||
| ```long_prefill_token_threshold``` | `int` | When Chunked Prefill is enabled, requests with token count exceeding this value are considered long requests, default: max_model_len*0.04 |
|
||||
| ```static_decode_blocks``` | `int` | During inference, each request is forced to allocate corresponding number of blocks from Prefill's KVCache for Decode use, default: 2 |
|
||||
| ```reasoning_parser``` | `str` | Specify the reasoning parser to extract reasoning content from model output |
|
||||
| ```use_cudagraph``` | `bool` | Whether to use cuda graph, default: False |
|
||||
|```graph_optimization_config``` | `str` | Parameters related to graph optimization can be configured, with default values of'{"use_cudagraph":false, "graph_opt_level":0, "cudagraph_capture_sizes": null }' |
|
||||
| ```use_cudagraph``` | `bool` | Whether to use CUDA Graph, default: False. It is recommended to read [graph_optimization.md](./features/graph_optimization.md) carefully before enabling it; in multi-GPU scenarios, custom all-reduce must be enabled at the same time. |
|
||||
| ```graph_optimization_config``` | `dict[str]` | Parameters related to computation graph optimization; default: '{"use_cudagraph":false, "graph_opt_level":0, "cudagraph_capture_sizes": null }'. See [graph_optimization.md](./features/graph_optimization.md) for details. |
|
||||
| ```enable_custom_all_reduce``` | `bool` | Enable Custom all-reduce, default: False |
|
||||
| ```splitwise_role``` | `str` | Whether to enable splitwise inference, default value: mixed, supported parameters: ["mixed", "decode", "prefill"] |
|
||||
| ```innode_prefill_ports``` | `str` | Internal engine startup ports for prefill instances (only required for single-machine PD separation), default: None |
|
||||
@@ -44,6 +46,8 @@ When using FastDeploy to deploy models (including offline inference and service
|
||||
| ```dynamic_load_weight``` | `int` | Whether to enable dynamic weight loading, default: 0 |
|
||||
| ```enable_expert_parallel``` | `bool` | Whether to enable expert parallel |
|
||||
| ```enable_logprob``` | `bool` | Whether to return log probabilities of the output tokens. If true, the log probability of each output token is returned in the message content. If logprobs are not needed, this parameter can be omitted at startup. |
|
||||
| ```tool_call_parser``` | `str` | Specify the function call parser to be used for extracting function call content from the model's output. |
|
||||
| ```tool_parser_plugin``` | `str` | Specify the file path of the tool parser to be registered, so as to register parsers that are not in the code repository. The code format within these parsers must adhere to the format used in the code repository. |
|
||||
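As a hedged illustration of `enable_logprob` (not from the original docs): assuming the service was started with `--enable-logprob` and honors the standard OpenAI `logprobs`/`top_logprobs` request fields, a client request might look like the sketch below; host, port, and model name are placeholders.

```python
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")
resp = client.chat.completions.create(
    model="baidu/ERNIE-4.5-0.3B-Paddle",
    messages=[{"role": "user", "content": "Hello"}],
    logprobs=True,
    top_logprobs=5,
)
# Each returned token carries its log probability when logprobs are enabled.
print(resp.choices[0].logprobs)
```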
|
||||
## 1. Relationship between KVCache allocation, ```num_gpu_blocks_override``` and ```block_size```?
|
||||
|
||||
@@ -68,86 +72,3 @@ In actual inference, it's difficult for users to know how to properly configure
|
||||
When `enable_chunked_prefill` is enabled, the service processes long input sequences through dynamic chunking, significantly improving GPU resource utilization. In this mode, the original `max_num_batched_tokens` parameter no longer constrains the number of prefill tokens per batch (it limits the token count of a single prefill), so the `max_num_partial_prefills` parameter is introduced specifically to limit the number of partial prefill batches processed concurrently.
|
||||
|
||||
To optimize scheduling priority for short requests, the `max_long_partial_prefills` and `long_prefill_token_threshold` parameter combination is added. The former limits the number of long requests in a single prefill batch; the latter defines the token threshold above which a request counts as long. The system prioritizes batch space for short requests, reducing short-request latency in mixed workloads while keeping overall throughput stable.
|
||||
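The following is a simplified sketch of the scheduling behaviour described above, not FastDeploy's actual scheduler code; the limits `max_num_partial_prefills = 4` and `max_long_partial_prefills = 1` are hypothetical example values.

```python
max_model_len = 32768
long_prefill_token_threshold = int(max_model_len * 0.04)  # default described above
max_num_partial_prefills = 4      # hypothetical limit on concurrent partial prefills
max_long_partial_prefills = 1     # hypothetical limit on long requests per prefill batch

def admit(prefill_queue):
    """Pick requests (token counts) for one prefill step, preferring short requests."""
    batch, long_count = [], 0
    for req_tokens in sorted(prefill_queue):          # short requests first
        is_long = req_tokens > long_prefill_token_threshold
        if is_long and long_count >= max_long_partial_prefills:
            continue                                   # defer extra long requests
        batch.append(req_tokens)
        long_count += is_long
        if len(batch) == max_num_partial_prefills:
            break
    return batch

print(admit([500, 4096, 200, 30000, 128]))
```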
|
||||
## 4. GraphOptimizationBackend related configuration parameters
|
||||
Currently, only user configuration of the following parameters is supported:
|
||||
- `use_cudagraph` : bool = False
|
||||
- `graph_optimization_config` : Dict[str, Any]
|
||||
- `graph_opt_level`: int = 0
|
||||
- `use_cudagraph`: bool = False
|
||||
- `cudagraph_capture_sizes` : List[int] = None
|
||||
|
||||
CudaGraph can be enabled by setting `--use-cudagraph` or `--graph-optimization-config '{"use_cudagraph":true}'`. Setting it through both methods at the same time may cause conflicts.
|
||||
|
||||
The `graph_opt_level` parameter within `--graph-optimization-config` is used to configure the graph optimization level, with the following available options:
|
||||
- `0`: Use dynamic compute graph (default)
- `1`: Use static compute graph; during the initialization phase, the Paddle API is used to convert the dynamic graph into a static graph
- `2`: On top of the static compute graph, use Paddle's compiler (CINN, Compiler Infrastructure for Neural Networks) to compile and optimize
|
||||
|
||||
In general, static graphs have lower Kernel Launch overhead than dynamic graphs, and it is recommended to use static graphs.
|
||||
For adapted models, FastDeploy's CudaGraph *can support both dynamic and static graphs* simultaneously.
|
||||
|
||||
When CudaGraph is enabled with the default configuration, the list of batch sizes that CudaGraph needs to capture is set automatically based on the `max_num_seqs` parameter. The logic for generating the list of batch sizes to capture is as follows:
|
||||
|
||||
1. Generate a candidate list with a range of [1,1024] Batch Size.
|
||||
|
||||
```
|
||||
# Batch Size [1, 2, 4, 8, 16, ... 120, 128]
|
||||
candidate_capture_sizes = [1, 2, 4] + [8 * i for i in range(1, 17)]
|
||||
# Batch Size (128, 144, ... 240, 256]
|
||||
candidate_capture_sizes += [16 * i for i in range(9, 17)]
|
||||
# Batch Size (256, 288, ... 992, 1024]
|
||||
candidate_capture_sizes += [32 * i for i in range(17, 33)]
|
||||
```
|
||||
|
||||
2. Crop the candidate list based on the user-set `max_num_seqs` to obtain the CudaGraph capture list in the range [1, `max_num_seqs`].
|
||||
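A minimal sketch of the cropping step, assuming it is a simple filter of the candidate list against `max_num_seqs` (the value 64 below is an example):

```python
max_num_seqs = 64  # example value from a hypothetical launch configuration

candidate_capture_sizes = [1, 2, 4] + [8 * i for i in range(1, 17)]
candidate_capture_sizes += [16 * i for i in range(9, 17)]
candidate_capture_sizes += [32 * i for i in range(17, 33)]

cudagraph_capture_sizes = [bs for bs in candidate_capture_sizes if bs <= max_num_seqs]
print(cudagraph_capture_sizes)  # [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64]
```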
|
||||
Users can also customize the batch size list that CudaGraph needs to capture through the `cudagraph_capture_sizes` parameter in `--graph-optimization-config`:
|
||||
|
||||
```
|
||||
--graph-optimization-config '{"cudagraph_capture_sizes": [1, 3, 5, 7, 9]}'
|
||||
```
|
||||
|
||||
### CudaGraph related parameters
|
||||
|
||||
Using CudaGraph incurs some additional memory overhead, divided into two categories in FastDeploy:
|
||||
- Additional input Buffer overhead
|
||||
- CudaGraph uses a dedicated memory pool and therefore holds some intermediate activation memory isolated from the main framework
|
||||
|
||||
During initialization FastDeploy first uses the `gpu_memory_utilization` parameter to calculate the memory available for `KVCache`; only after `KVCache` is initialized is the remaining memory used to initialize CudaGraph. Since CudaGraph is not enabled by default yet, using the default startup parameters may lead to `Out of memory` errors. The following solutions can be tried:
|
||||
- Lower `gpu_memory_utilization` value, reserve more memory for CudaGraph.
|
||||
- Lower `max_num_seqs` to decrease the maximum concurrency.
|
||||
- Customize the batch size list that CudaGraph needs to capture through `graph_optimization_config`, and reduce the number of captured graphs by using `cudagraph_capture_sizes`
|
||||
|
||||
- Before use, ensure the loaded model is properly decorated with ```@support_graph_optimization```.
|
||||
|
||||
```python
|
||||
# 1. import decorator
|
||||
from fastdeploy.model_executor.graph_optimization.decorator import support_graph_optimization
|
||||
...
|
||||
|
||||
# 2. add decorator
|
||||
@support_graph_optimization
|
||||
class Ernie4_5_Model(nn.Layer): # Note decorator is added to nn.Layer subclass
|
||||
...
|
||||
|
||||
# 3. modify parameter passing in ModelForCasualLM subclass's self.model()
|
||||
class Ernie4_5_MoeForCausalLM(ModelForCasualLM):
|
||||
...
|
||||
def forward(
|
||||
self,
|
||||
ids_remove_padding: paddle.Tensor,
|
||||
forward_meta: ForwardMeta,
|
||||
):
|
||||
hidden_states = self.model(ids_remove_padding=ids_remove_padding, # specify parameter name when passing
|
||||
forward_meta=forward_meta)
|
||||
        return hidden_states
```
|
||||
|
||||
- When ```use_cudagraph``` is enabled, currently only supports single-GPU inference, i.e. ```tensor_parallel_size``` set to 1.
|
||||
- When ```use_cudagraph``` is enabled, cannot enable ```enable_prefix_caching``` or ```enable_chunked_prefill```.
|
||||
|
@@ -38,7 +38,7 @@ environment_variables: dict[str, Callable[[], Any]] = {
|
||||
|
||||
# Whether to use HuggingFace tokenizer (0 or 1)
|
||||
"FD_USE_HF_TOKENIZER":
|
||||
lambda: os.getenv("FD_USE_HF_TOKENIZER", 0),
|
||||
lambda: bool(int(os.getenv("FD_USE_HF_TOKENIZER", 0))),
|
||||
|
||||
# ZMQ send high-water mark (HWM) during initialization
|
||||
"FD_ZMQ_SNDHWM":
|
||||
|
@@ -2,11 +2,17 @@
|
||||
|Model Name|Context Length|Quantization|XPUs Required|Deployment Commands|Minimum Version Required|
|
||||
|-|-|-|-|-|-|
|
||||
|ERNIE-4.5-300B-A47B|32K|WINT8|8|export XPU_VISIBLE_DEVICES="0,1,2,3,4,5,6,7"<br>python -m fastdeploy.entrypoints.openai.api_server \ <br> --model PaddlePaddle/ERNIE-4.5-300B-A47B-Paddle \ <br> --port 8188 \ <br> --tensor-parallel-size 8 \ <br> --max-model-len 32768 \ <br> --max-num-seqs 64 \ <br> --quantization "wint8" \ <br> --gpu-memory-utilization 0.9|>=2.0.3|
|
||||
|ERNIE-4.5-300B-A47B|32K|WINT4|4 (recommend)|export XPU_VISIBLE_DEVICES="0,1,2,3" or "4,5,6,7"<br>python -m fastdeploy.entrypoints.openai.api_server \ <br> --model PaddlePaddle/ERNIE-4.5-300B-A47B-Paddle \ <br> --port 8188 \ <br> --tensor-parallel-size 4 \ <br> --max-model-len 32768 \ <br> --max-num-seqs 64 \ <br> --quantization "wint4" \ <br> --gpu-memory-utilization 0.9|>=2.0.0|
|
||||
|ERNIE-4.5-300B-A47B|32K|WINT4|8|export XPU_VISIBLE_DEVICES="0,1,2,3,4,5,6,7"<br>python -m fastdeploy.entrypoints.openai.api_server \ <br> --model PaddlePaddle/ERNIE-4.5-300B-A47B-Paddle \ <br> --port 8188 \ <br> --tensor-parallel-size 8 \ <br> --max-model-len 32768 \ <br> --max-num-seqs 64 \ <br> --quantization "wint4" \ <br> --gpu-memory-utilization 0.9|>=2.0.0|
|
||||
|ERNIE-4.5-300B-A47B|128K|WINT4|8 (recommend)|export XPU_VISIBLE_DEVICES="0,1,2,3,4,5,6,7"<br>python -m fastdeploy.entrypoints.openai.api_server \ <br> --model PaddlePaddle/ERNIE-4.5-300B-A47B-Paddle \ <br> --port 8188 \ <br> --tensor-parallel-size 8 \ <br> --max-model-len 131072 \ <br> --max-num-seqs 64 \ <br> --quantization "wint4" \ <br> --gpu-memory-utilization 0.9|>=2.0.0|
|
||||
|ERNIE-4.5-300B-A47B|32K|WINT4|4 (Recommended)|export XPU_VISIBLE_DEVICES="0,1,2,3" or "4,5,6,7"<br>python -m fastdeploy.entrypoints.openai.api_server \ <br> --model PaddlePaddle/ERNIE-4.5-300B-A47B-Paddle \ <br> --port 8188 \ <br> --tensor-parallel-size 4 \ <br> --max-model-len 32768 \ <br> --max-num-seqs 64 \ <br> --quantization "wint4" \ <br> --gpu-memory-utilization 0.9|>=2.0.0|
|
||||
|ERNIE-4.5-300B-A47B|32K|WINT4|8|export XPU_VISIBLE_DEVICES="0,1,2,3,4,5,6,7"<br>python -m fastdeploy.entrypoints.openai.api_server \ <br> --model PaddlePaddle/ERNIE-4.5-300B-A47B-Paddle \ <br> --port 8188 \ <br> --tensor-parallel-size 8 \ <br> --max-model-len 32768 \ <br> --max-num-seqs 64 \ <br> --quantization "wint4" \ <br> --gpu-memory-utilization 0.95|>=2.0.0|
|
||||
|ERNIE-4.5-300B-A47B|128K|WINT4|8 (Recommended)|export XPU_VISIBLE_DEVICES="0,1,2,3,4,5,6,7"<br>python -m fastdeploy.entrypoints.openai.api_server \ <br> --model PaddlePaddle/ERNIE-4.5-300B-A47B-Paddle \ <br> --port 8188 \ <br> --tensor-parallel-size 8 \ <br> --max-model-len 131072 \ <br> --max-num-seqs 64 \ <br> --quantization "wint4" \ <br> --gpu-memory-utilization 0.9|>=2.0.0|
|
||||
|ERNIE-4.5-21B-A3B|32K|BF16|1|export XPU_VISIBLE_DEVICES="0" # Specify any card<br>python -m fastdeploy.entrypoints.openai.api_server \ <br> --model PaddlePaddle/ERNIE-4.5-21B-A3B-Paddle \ <br> --port 8188 \ <br> --tensor-parallel-size 1 \ <br> --max-model-len 32768 \ <br> --max-num-seqs 128 \ <br> --gpu-memory-utilization 0.9|>=2.1.0|
|
||||
|ERNIE-4.5-21B-A3B|32K|WINT8|1|export XPU_VISIBLE_DEVICES="0" # Specify any card<br>python -m fastdeploy.entrypoints.openai.api_server \ <br> --model PaddlePaddle/ERNIE-4.5-21B-A3B-Paddle \ <br> --port 8188 \ <br> --tensor-parallel-size 1 \ <br> --max-model-len 32768 \ <br> --max-num-seqs 128 \ <br> --quantization "wint8" \ <br> --gpu-memory-utilization 0.9|>=2.1.0|
|
||||
|ERNIE-4.5-21B-A3B|32K|WINT4|1|export XPU_VISIBLE_DEVICES="0" # Specify any card<br>python -m fastdeploy.entrypoints.openai.api_server \ <br> --model PaddlePaddle/ERNIE-4.5-21B-A3B-Paddle \ <br> --port 8188 \ <br> --tensor-parallel-size 1 \ <br> --max-model-len 32768 \ <br> --max-num-seqs 128 \ <br> --quantization "wint4" \ <br> --gpu-memory-utilization 0.9|>=2.1.0|
|
||||
|ERNIE-4.5-21B-A3B|128K|BF16|1|export XPU_VISIBLE_DEVICES="0" # Specify any card<br>python -m fastdeploy.entrypoints.openai.api_server \ <br> --model PaddlePaddle/ERNIE-4.5-21B-A3B-Paddle \ <br> --port 8188 \ <br> --tensor-parallel-size 1 \ <br> --max-model-len 131072 \ <br> --max-num-seqs 128 \ <br> --gpu-memory-utilization 0.9|>=2.1.0|
|
||||
|ERNIE-4.5-21B-A3B|128K|WINT8|1|export XPU_VISIBLE_DEVICES="0" # Specify any card<br>python -m fastdeploy.entrypoints.openai.api_server \ <br> --model PaddlePaddle/ERNIE-4.5-21B-A3B-Paddle \ <br> --port 8188 \ <br> --tensor-parallel-size 1 \ <br> --max-model-len 131072 \ <br> --max-num-seqs 128 \ <br> --quantization "wint8" \ <br> --gpu-memory-utilization 0.9|>=2.1.0|
|
||||
|ERNIE-4.5-21B-A3B|128K|WINT4|1|export XPU_VISIBLE_DEVICES="0" # Specify any card<br>python -m fastdeploy.entrypoints.openai.api_server \ <br> --model PaddlePaddle/ERNIE-4.5-21B-A3B-Paddle \ <br> --port 8188 \ <br> --tensor-parallel-size 1 \ <br> --max-model-len 131072 \ <br> --max-num-seqs 128 \ <br> --quantization "wint4" \ <br> --gpu-memory-utilization 0.9|>=2.1.0|
|
||||
|ERNIE-4.5-0.3B|32K|BF16|1|export XPU_VISIBLE_DEVICES="0" # Specify any card<br>python -m fastdeploy.entrypoints.openai.api_server \ <br> --model PaddlePaddle/ERNIE-4.5-0.3B-Paddle \ <br> --port 8188 \ <br> --tensor-parallel-size 1 \ <br> --max-model-len 32768 \ <br> --max-num-seqs 128 \ <br> --gpu-memory-utilization 0.9|>=2.0.3|
|
||||
|ERNIE-4.5-0.3B|32K|WINT8|1|export XPU_VISIBLE_DEVICES="x" # Specify any card<br>python -m fastdeploy.entrypoints.openai.api_server \ <br> --model PaddlePaddle/ERNIE-4.5-0.3B-Paddle \ <br> --port 8188 \ <br> --tensor-parallel-size 1 \ <br> --max-model-len 32768 \ <br> --max-num-seqs 128 \ <br> --quantization "wint8" \ <br> --gpu-memory-utilization 0.9|>=2.0.3|
|
||||
|ERNIE-4.5-0.3B|32K|WINT8|1|export XPU_VISIBLE_DEVICES="0" # Specify any card<br>python -m fastdeploy.entrypoints.openai.api_server \ <br> --model PaddlePaddle/ERNIE-4.5-0.3B-Paddle \ <br> --port 8188 \ <br> --tensor-parallel-size 1 \ <br> --max-model-len 32768 \ <br> --max-num-seqs 128 \ <br> --quantization "wint8" \ <br> --gpu-memory-utilization 0.9|>=2.0.3|
|
||||
|ERNIE-4.5-0.3B|128K|BF16|1|export XPU_VISIBLE_DEVICES="0" # Specify any card<br>python -m fastdeploy.entrypoints.openai.api_server \ <br> --model PaddlePaddle/ERNIE-4.5-0.3B-Paddle \ <br> --port 8188 \ <br> --tensor-parallel-size 1 \ <br> --max-model-len 131072 \ <br> --max-num-seqs 128 \ <br> --gpu-memory-utilization 0.9|>=2.0.3|
|
||||
|ERNIE-4.5-0.3B|128K|WINT8|1|export XPU_VISIBLE_DEVICES="0" # Specify any card<br>python -m fastdeploy.entrypoints.openai.api_server \ <br> --model PaddlePaddle/ERNIE-4.5-0.3B-Paddle \ <br> --port 8188 \ <br> --tensor-parallel-size 1 \ <br> --max-model-len 131072 \ <br> --max-num-seqs 128 \ <br> --quantization "wint8" \ <br> --gpu-memory-utilization 0.9|>=2.0.3|
|
||||
|
||||
@@ -83,4 +89,4 @@ for chunk in response:
|
||||
print('\n')
|
||||
```
|
||||
|
||||
For detailed OpenAI protocol specifications, see [OpenAI Chat Completion API](https://platform.openai.com/docs/api-reference/chat/create). Differences from the standard OpenAI protocol are documented in [OpenAI Protocol-Compatible API Server](../../online_serving/README.md).
For detailed OpenAI protocol specifications, see [OpenAI Chat Completion API](https://platform.openai.com/docs/api-reference/chat/create). Differences from the standard OpenAI protocol are documented in [OpenAI Protocol-Compatible API Server](../online_serving/README.md).
|
||||
|
@@ -2,6 +2,7 @@
|
||||
## 一、环境准备
|
||||
### 1.1 支持情况
|
||||
ERNIE-4.5-0.3B 各量化精度,在下列硬件上部署所需要的最小卡数如下:
|
||||
|
||||
| | WINT8 | WINT4 | FP8 |
|
||||
|-----|-----|-----|-----|
|
||||
|H800 80GB| 1 | 1 | 1 |
|
||||
@@ -24,12 +25,12 @@ ERNIE-4.5-0.3B 各量化精度,在下列硬件上部署所需要的最小卡
|
||||
### 2.1 基础:启动服务
|
||||
通过下列命令启动服务
|
||||
```bash
|
||||
export ENABLE_V1_KVCACHE_SCHEDULER=1
|
||||
python -m fastdeploy.entrypoints.openai.api_server \
|
||||
--model baidu/ERNIE-4.5-0.3B-Paddle \
|
||||
--tensor-parallel-size 1 \
|
||||
--quantization wint4 \
|
||||
--max-model-len 32768 \
|
||||
--kv-cache-ratio 0.75 \
|
||||
--max-num-seqs 128
|
||||
```
|
||||
其中:
|
||||
@@ -75,9 +76,9 @@ CUDAGraph 是 NVIDIA 提供的一项 GPU 计算加速技术,通过将 CUDA 操
|
||||
--use-cudagraph
|
||||
```
|
||||
注:
|
||||
1. 通常情况下不需要额外设置其他参数,但CUDAGraph会产生一些额外的显存开销,在一些显存受限的场景下可能需要调整。详细的参数调整请参考[GraphOptimizationBackend](../parameters.md) 相关配置参数说明
|
||||
2. 开启CUDAGraph时,暂时只支持单卡推理,即`--tensor-parallel-size 1`
|
||||
3. 开启CUDAGraph时,暂时不支持同时开启`Chunked Prefill`和`Prefix Caching`
|
||||
1. 通常情况下不需要额外设置其他参数,但CUDAGraph会产生一些额外的显存开销,在一些显存受限的场景下可能需要调整。详细的参数调整请参考[GraphOptimizationBackend](../features/graph_optimization.md) 相关配置参数说明
|
||||
2. 开启CUDAGraph时,如果是TP>1的多卡推理场景,需要同时指定 `--enable-custom-all-reduce`
|
||||
3. 开启CUDAGraph时,暂时不支持`max-model-len > 32768`的场景。
|
||||
|
||||
#### 2.2.5 拒绝采样
|
||||
**原理:**
|
@@ -2,6 +2,7 @@
|
||||
## 一、环境准备
|
||||
### 1.1 支持情况
|
||||
ERNIE-4.5-21B-A3B 各量化精度,在下列硬件上部署所需要的最小卡数如下:
|
||||
|
||||
| | WINT8 | WINT4 | FP8 |
|
||||
|-----|-----|-----|-----|
|
||||
|H800 80GB| 1 | 1 | 1 |
|
||||
@@ -24,12 +25,12 @@ ERNIE-4.5-21B-A3B 各量化精度,在下列硬件上部署所需要的最小
|
||||
### 2.1 基础:启动服务
|
||||
通过下列命令启动服务
|
||||
```bash
|
||||
export ENABLE_V1_KVCACHE_SCHEDULER=1
|
||||
python -m fastdeploy.entrypoints.openai.api_server \
|
||||
--model baidu/ERNIE-4.5-21B-A3B-Paddle \
|
||||
--tensor-parallel-size 1 \
|
||||
--quantization wint4 \
|
||||
--max-model-len 32768 \
|
||||
--kv-cache-ratio 0.75 \
|
||||
--max-num-seqs 128
|
||||
```
|
||||
其中:
|
||||
@@ -85,9 +86,9 @@ CUDAGraph 是 NVIDIA 提供的一项 GPU 计算加速技术,通过将 CUDA 操
|
||||
--use-cudagraph
|
||||
```
|
||||
注:
|
||||
1. 通常情况下不需要额外设置其他参数,但CUDAGraph会产生一些额外的显存开销,在一些显存受限的场景下可能需要调整。详细的参数调整请参考[GraphOptimizationBackend](../parameters.md) 相关配置参数说明
|
||||
2. 开启CUDAGraph时,暂时只支持单卡推理,即`--tensor-parallel-size 1`
|
||||
3. 开启CUDAGraph时,暂时不支持同时开启`Chunked Prefill`和`Prefix Caching`
|
||||
1. 通常情况下不需要额外设置其他参数,但CUDAGraph会产生一些额外的显存开销,在一些显存受限的场景下可能需要调整。详细的参数调整请参考[GraphOptimizationBackend](../features/graph_optimization.md) 相关配置参数说明
|
||||
2. 开启CUDAGraph时,如果是TP>1的多卡推理场景,需要同时指定 `--enable-custom-all-reduce`
|
||||
3. 开启CUDAGraph时,暂时不支持`max-model-len > 32768`的场景。
|
||||
|
||||
#### 2.2.6 拒绝采样
|
||||
**原理:**
|
@@ -2,6 +2,7 @@
|
||||
## 一、环境准备
|
||||
### 1.1 支持情况
|
||||
ERNIE-4.5-300B-A47B各量化精度,在下列硬件上部署所需要的最小卡数如下:
|
||||
|
||||
| | WINT8 | WINT4 | FP8 | WINT2 | W4A8 |
|
||||
|-----|-----|-----|-----|-----|-----|
|
||||
|H800 80GB| 8 | 4 | 8 | 2 | 4 |
|
||||
@@ -21,12 +22,12 @@ ERNIE-4.5-300B-A47B各量化精度,在下列硬件上部署所需要的最小
|
||||
### 2.1 基础:启动服务
|
||||
通过下列命令启动服务
|
||||
```bash
|
||||
export ENABLE_V1_KVCACHE_SCHEDULER=1
|
||||
python -m fastdeploy.entrypoints.openai.api_server \
|
||||
--model baidu/ERNIE-4.5-300B-A47B-Paddle \
|
||||
--tensor-parallel-size 8 \
|
||||
--quantization wint4 \
|
||||
--max-model-len 32768 \
|
||||
--kv-cache-ratio 0.75 \
|
||||
--max-num-seqs 128
|
||||
```
|
||||
其中:
|
||||
@@ -124,5 +125,20 @@ python -m fastdeploy.entrypoints.openai.api_server \
|
||||
--splitwise-role "decode"
|
||||
```
|
||||
|
||||
#### 2.2.8 CUDAGraph
|
||||
**原理:**
|
||||
CUDAGraph 是 NVIDIA 提供的一项 GPU 计算加速技术,通过将 CUDA 操作序列捕获(capture)为图结构(graph),实现 GPU 任务的高效执行和优化。CUDAGraph 的核心思想是将一系列 GPU 计算和内存操作封装为一个可重复执行的图,从而减少 CPU-GPU 通信开销、降低内核启动延迟,并提升整体计算性能。
|
||||
|
||||
**启用方式:**
|
||||
在启动命令中增加
|
||||
```
|
||||
--use-cudagraph
|
||||
--enable-custom-all-reduce
|
||||
```
|
||||
注:
|
||||
1. 通常情况下不需要额外设置其他参数,但CUDAGraph会产生一些额外的显存开销,在一些显存受限的场景下可能需要调整。详细的参数调整请参考[GraphOptimizationBackend](../features/graph_optimization.md) 相关配置参数说明
|
||||
2. 开启CUDAGraph时,如果是TP>1的多卡推理场景,需要同时指定 `--enable-custom-all-reduce`
|
||||
3. 开启CUDAGraph时,暂时不支持`max-model-len > 32768`的场景。
|
||||
|
||||
## 三、常见问题FAQ
|
||||
如果您在使用过程中遇到问题,可以在[FAQ](./FAQ.md)中查阅。
|
docs/zh/best_practices/ERNIE-4.5-VL-28B-A3B-Paddle.md (new file, 134 lines)
@@ -0,0 +1,134 @@
|
||||
|
||||
# ERNIE-4.5-VL-28B-A3B-Paddle
|
||||
|
||||
## 一、环境准备
|
||||
### 1.1 支持情况
|
||||
在下列硬件上部署所需要的最小卡数如下:
|
||||
|
||||
| 设备[显存] | WINT4 | WINT8 | BFLOAT16 |
|
||||
|:----------:|:----------:|:------:| :------:|
|
||||
| A30 [24G] | 2 | 2 | 4 |
|
||||
| L20 [48G] | 1 | 1 | 2 |
|
||||
| H20 [144G] | 1 | 1 | 1 |
|
||||
| A100 [80G] | 1 | 1 | 1 |
|
||||
| H800 [80G] | 1 | 1 | 1 |
|
||||
|
||||
### 1.2 安装fastdeploy
|
||||
|
||||
安装流程参考文档 [FastDeploy GPU 安装](../get_started/installation/nvidia_gpu.md)
|
||||
|
||||
> ⚠️ 注意事项
|
||||
> - FastDeploy只支持Paddle格式的模型,注意下载Paddle后缀的模型
|
||||
> - 使用模型名称会自动下载模型,如果已经下载过模型,可以直接使用模型下载位置的绝对路径
|
||||
|
||||
## 二、如何使用
|
||||
### 2.1 基础:启动服务
|
||||
**示例1:** 4090上单卡部署32K上下文的服务
|
||||
```shell
|
||||
export ENABLE_V1_KVCACHE_SCHEDULER=1
|
||||
python -m fastdeploy.entrypoints.openai.api_server \
|
||||
--model baidu/ERNIE-4.5-VL-28B-A3B-Paddle \
|
||||
--port 8180 \
|
||||
--metrics-port 8181 \
|
||||
--engine-worker-queue-port 8182 \
|
||||
--tensor-parallel-size 1 \
|
||||
--max-model-len 32768 \
|
||||
--max-num-seqs 32 \
|
||||
--limit-mm-per-prompt '{"image": 100, "video": 100}' \
|
||||
--reasoning-parser ernie-45-vl \
|
||||
--gpu-memory-utilization 0.9 \
|
||||
--enable-chunked-prefill \
|
||||
--max-num-batched-tokens 384 \
|
||||
--quantization wint4 \
|
||||
--enable-mm
|
||||
```
|
||||
**示例2:** H800上双卡部署128K上下文的服务
|
||||
```shell
|
||||
export ENABLE_V1_KVCACHE_SCHEDULER=1
|
||||
python -m fastdeploy.entrypoints.openai.api_server \
|
||||
--model baidu/ERNIE-4.5-VL-28B-A3B-Paddle \
|
||||
--port 8180 \
|
||||
--metrics-port 8181 \
|
||||
--engine-worker-queue-port 8182 \
|
||||
--tensor-parallel-size 2 \
|
||||
--max-model-len 131072 \
|
||||
--max-num-seqs 128 \
|
||||
--limit-mm-per-prompt '{"image": 100, "video": 100}' \
|
||||
--reasoning-parser ernie-45-vl \
|
||||
--gpu-memory-utilization 0.9 \
|
||||
--enable-chunked-prefill \
|
||||
--max-num-batched-tokens 384 \
|
||||
--quantization wint4 \
|
||||
--enable-mm
|
||||
```
|
||||
> ⚠️ 2.1及以上版本需要通过环境变量开启新调度器 `ENABLE_V1_KVCACHE_SCHEDULER=1`,否则可能会有部分请求最大长度前截断或返空。
|
||||
|
||||
示例是可以稳定运行的一组配置,同时也能得到比较好的性能。
|
||||
如果对精度、性能有进一步的要求,请继续阅读下面的内容。
|
||||
### 2.2 进阶:如何获取更优性能
|
||||
|
||||
#### 2.2.1 评估应用场景,正确设置参数
|
||||
> **上下文长度**
|
||||
- **参数:** `--max-model-len`
|
||||
- **描述:** 控制模型可处理的最大上下文长度。
|
||||
- **推荐:** 更长的上下文会导致吞吐降低,根据实际情况设置,`ERNIE-4.5-VL-28B-A3B-Paddle`最长支持**128k**(131072)长度的上下文。
|
||||
|
||||
⚠️ 注:更长的上下文会显著增加GPU显存需求,设置更长的上下文之前确保硬件资源是满足的。
|
||||
> **最大序列数量**
|
||||
- **参数:** `--max-num-seqs`
|
||||
- **描述:** 控制服务可以处理的最大序列数量,支持1~256。
|
||||
- **推荐:** 如果您不知道实际应用场景中请求的平均序列数量是多少,我们建议设置为**256**。如果您的应用场景中请求的平均序列数量明显少于256,我们建议设置为一个略大于平均值的较小值,以进一步降低显存占用,优化服务性能。
|
||||
|
||||
> **多图、多视频输入**
|
||||
- **参数**:`--limit-mm-per-prompt`
|
||||
- **描述**:我们的模型支持单次提示词(prompt)中输入多张图片和视频。请使用此参数限制每次请求的图片/视频数量,以确保资源高效利用。
|
||||
- **推荐**:我们建议将单次提示词(prompt)中的图片和视频数量均设置为100个,以平衡性能与内存占用。
|
||||
|
||||
> **初始化时可用的显存比例**
|
||||
- **参数:** `--gpu-memory-utilization`
|
||||
- **用处:** 用于控制 FastDeploy 初始化服务的可用显存,默认0.9,即预留10%的显存备用。
|
||||
- **推荐:** 推荐使用默认值0.9。如果服务压测时提示显存不足,可以尝试调低该值。
|
||||
|
||||
#### 2.2.2 Chunked Prefill
|
||||
- **参数:** `--enable-chunked-prefill`
|
||||
- **用处:** 开启 `chunked prefill` 可**降低显存峰值**并**提升服务吞吐**。
|
||||
|
||||
- **其他相关配置**:
|
||||
|
||||
`--max-num-batched-tokens`:限制每个chunk的最大token数量。多模场景下每个chunk会向上取整保持图片的完整性,因此实际每次推理的总token数会大于该值。我们推荐设置为384。
|
||||
|
||||
#### 2.2.3 **量化精度**
|
||||
- **参数:** `--quantization`
|
||||
|
||||
- **已支持的精度类型:**
|
||||
- WINT4 (适合大多数用户)
|
||||
- WINT8
|
||||
- BFLOAT16 (未设置 `--quantization` 参数时,默认使用BFLOAT16)
|
||||
|
||||
- **推荐:**
|
||||
- 除非您有极其严格的精度要求,否则我们建议使用WINT4量化。这将显著降低内存占用并提升吞吐量。
|
||||
- 若需要稍高的精度,可尝试WINT8。
|
||||
- 仅当您的应用场景对精度有极致要求时候才尝试使用BFLOAT16,因为它需要更多显存。
|
||||
|
||||
#### 2.2.4 **可调整的环境变量**
|
||||
> **拒绝采样:**`FD_SAMPLING_CLASS=rejection`
|
||||
- **描述**:拒绝采样即从一个易于采样的提议分布(proposal distribution)中生成样本,避免显式排序从而达到提升采样速度的效果,可以提升推理性能。
|
||||
- **推荐**:这是一种影响效果的较为激进的优化策略,我们还在全面验证影响。如果对性能有较高要求,也可以接受对效果的影响时可以尝试开启。
|
||||
|
||||
> **Attention超参:**`FLAGS_max_partition_size=1024`
|
||||
- **描述**:Append Attention(默认)后端的超参,我们在常用数据集上的测试结果表明,设置为1024后可以大幅提升解码速度,尤其是长文场景。
|
||||
- **推荐**:未来会修改为自动调整的机制。如果对性能有较高要求可以尝试开启。
|
||||
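下面是一段示意性的 Python 草图(非官方示例):假设在同一进程中先设置上述环境变量,再启动 2.1 节中的服务命令,环境变量会被子进程继承。

```python
# 示意代码(假设):启动服务前设置上文提到的环境变量
import os
import subprocess

os.environ["ENABLE_V1_KVCACHE_SCHEDULER"] = "1"    # 2.1 及以上版本需要开启的新调度器(见上文)
os.environ["FD_SAMPLING_CLASS"] = "rejection"      # 开启拒绝采样(见上文说明)
os.environ["FLAGS_max_partition_size"] = "1024"    # Append Attention 超参(见上文说明)

subprocess.run([
    "python", "-m", "fastdeploy.entrypoints.openai.api_server",
    "--model", "baidu/ERNIE-4.5-VL-28B-A3B-Paddle",
    "--port", "8180",
    "--quantization", "wint4",
    "--enable-mm",
], check=True)
```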
|
||||
## 三、常见问题FAQ
|
||||
**注意:** 使用多模服务部署需要在配置中添加参数 `--enable-mm`。
|
||||
|
||||
### 3.1 显存不足(OOM)
|
||||
如果服务启动时提示显存不足,请尝试以下方法:
|
||||
1. 确保无其他进程占用显卡显存;
|
||||
2. 使用WINT4/WINT8量化,开启chunked prefill;
|
||||
3. 酌情降低上下文长度和最大序列数量;
|
||||
4. 增加部署卡数,使用2卡或4卡部署,即修改参数 `--tensor-parallel-size 2` 或 `--tensor-parallel-size 4`。
|
||||
|
||||
如果服务可以正常启动,运行时提示显存不足,请尝试以下方法:
|
||||
1. 酌情降低初始化时可用的显存比例,即调整参数 `--gpu-memory-utilization` 的值;
|
||||
2. 增加部署卡数,参数修改同上。
|
docs/zh/best_practices/ERNIE-4.5-VL-424B-A47B-Paddle.md (new file, 109 lines)
@@ -0,0 +1,109 @@
|
||||
|
||||
# ERNIE-4.5-VL-424B-A47B-Paddle
|
||||
|
||||
## 一、环境准备
|
||||
### 1.1 支持情况
|
||||
在下列硬件上部署所需要的最小卡数如下:
|
||||
|
||||
| 设备[显存] | WINT4 | WINT8 | BFLOAT16 |
|
||||
|:----------:|:----------:|:------:| :------:|
|
||||
| H20 [144G] | 8 | 8 | 8 |
|
||||
| A100 [80G] | 8 | 8 | - |
|
||||
| H800 [80G] | 8 | 8 | - |
|
||||
|
||||
### 1.2 安装fastdeploy
|
||||
|
||||
安装流程参考文档 [FastDeploy GPU 安装](../get_started/installation/nvidia_gpu.md)
|
||||
|
||||
> ⚠️ 注意事项
|
||||
> - FastDeploy只支持Paddle格式的模型,注意下载Paddle后缀的模型
|
||||
> - 使用模型名称会自动下载模型,如果已经下载过模型,可以直接使用模型下载位置的绝对路径
|
||||
|
||||
## 二、如何使用
|
||||
### 2.1 基础:启动服务
|
||||
**示例1:** H800上8卡部署128K上下文的服务
|
||||
```shell
|
||||
python -m fastdeploy.entrypoints.openai.api_server \
|
||||
--model baidu/ERNIE-4.5-VL-424B-A47B-Paddle \
|
||||
--port 8180 \
|
||||
--metrics-port 8181 \
|
||||
--engine-worker-queue-port 8182 \
|
||||
--tensor-parallel-size 8 \
|
||||
--max-model-len 131072 \
|
||||
--max-num-seqs 16 \
|
||||
--limit-mm-per-prompt '{"image": 100, "video": 100}' \
|
||||
--reasoning-parser ernie-45-vl \
|
||||
--gpu-memory-utilization 0.8 \
|
||||
--enable-chunked-prefill \
|
||||
--max-num-batched-tokens 384 \
|
||||
--quantization wint4 \
|
||||
--enable-mm
|
||||
```
|
||||
> ⚠️ 2.1及以上版本需要通过环境变量开启新调度器 `ENABLE_V1_KVCACHE_SCHEDULER=1`,否则可能会有部分请求最大长度前截断或返空。
|
||||
|
||||
示例是可以稳定运行的一组配置,同时也能得到比较好的性能。
|
||||
如果对精度、性能有进一步的要求,请继续阅读下面的内容。
|
||||
### 2.2 进阶:如何获取更优性能
|
||||
|
||||
#### 2.2.1 评估应用场景,正确设置参数
|
||||
> **上下文长度**
|
||||
- **参数:** `--max-model-len`
|
||||
- **描述:** 控制模型可处理的最大上下文长度。
|
||||
- **推荐:** 更长的上下文会导致吞吐降低,根据实际情况设置,`ERNIE-4.5-VL-424B-A47B-Paddle` 最长支持**128k**(131072)长度的上下文。
|
||||
|
||||
> **最大序列数量**
|
||||
- **参数:** `--max-num-seqs`
|
||||
- **描述:** 控制服务可以处理的最大序列数量,支持1~256。
|
||||
- **推荐:** 128k场景下,80G显存的单机我们建议设置为**16**。
|
||||
|
||||
> **多图、多视频输入**
|
||||
- **参数**:`--limit-mm-per-prompt`
|
||||
- **描述**:我们的模型支持单次提示词(prompt)中输入多张图片和视频。请使用此参数限制每次请求的图片/视频数量,以确保资源高效利用。
|
||||
- **推荐**:我们建议将单次提示词(prompt)中的图片和视频数量均设置为100个,以平衡性能与内存占用。
|
||||
|
||||
> **初始化时可用的显存比例**
|
||||
- **参数:** `--gpu-memory-utilization`
|
||||
- **用处:** 用于控制 FastDeploy 初始化服务的可用显存,默认0.9,即预留10%的显存备用。
|
||||
- **推荐:** 128k长度的上下文时推荐使用0.8。如果服务压测时提示显存不足,可以尝试调低该值。
|
||||
|
||||
#### 2.2.2 Chunked Prefill
|
||||
- **参数:** `--enable-chunked-prefill`
|
||||
- **用处:** 开启 `chunked prefill` 可**降低显存峰值**并**提升服务吞吐**。
|
||||
|
||||
- **其他相关配置**:
|
||||
|
||||
`--max-num-batched-tokens`:限制每个chunk的最大token数量。多模场景下每个chunk会向上取整保持图片的完整性,因此实际每次推理的总token数会大于该值。推荐设置为384。
|
||||
|
||||
#### 2.2.3 **量化精度**
|
||||
- **参数:** `--quantization`
|
||||
|
||||
- **已支持的精度类型:**
|
||||
- WINT4 (适合大多数用户)
|
||||
- WINT8
|
||||
- BFLOAT16 (未设置 `--quantization` 参数时,默认使用BFLOAT16)
|
||||
|
||||
- **推荐:**
|
||||
- 除非您有极其严格的精度要求,否则我们建议使用WINT4量化。这将显著降低内存占用并提升吞吐量。
|
||||
- 若需要稍高的精度,可尝试WINT8。
|
||||
- 仅当您的应用场景对精度有极致要求时候才尝试使用BFLOAT16,因为它需要更多显存。
|
||||
|
||||
#### 2.2.4 **可调整的环境变量**
|
||||
> **拒绝采样:**`FD_SAMPLING_CLASS=rejection`
|
||||
- **描述**:拒绝采样即从一个易于采样的提议分布(proposal distribution)中生成样本,避免显式排序从而达到提升采样速度的效果,可以提升推理性能。
|
||||
- **推荐**:这是一种影响效果的较为激进的优化策略,我们还在全面验证影响。如果对性能有较高要求,也可以接受对效果的影响时可以尝试开启。
|
||||
|
||||
> **Attention超参:**`FLAGS_max_partition_size=1024`
|
||||
- **描述**:Append Attention(默认)后端的超参,我们在常用数据集上的测试结果表明,设置为1024后可以大幅提升解码速度,尤其是长文场景。
|
||||
- **推荐**:未来会修改为自动调整的机制。如果对性能有较高要求可以尝试开启。
|
||||
|
||||
## 三、常见问题FAQ
|
||||
**注意:** 使用多模服务部署需要在配置中添加参数 `--enable-mm`。
|
||||
|
||||
### 3.1 显存不足(OOM)
|
||||
如果服务启动时提示显存不足,请尝试以下方法:
|
||||
1. 确保无其他进程占用显卡显存;
|
||||
2. 使用WINT4/WINT8量化,开启chunked prefill;
|
||||
3. 酌情降低上下文长度和最大序列数量。
|
||||
|
||||
如果服务可以正常启动,运行时提示显存不足,请尝试以下方法:
|
||||
1. 酌情降低初始化时可用的显存比例,即调整参数 `--gpu-memory-utilization` 的值。
|
docs/zh/best_practices/README.md (new file, 7 lines)
@@ -0,0 +1,7 @@
|
||||
# 最佳实践
|
||||
|
||||
- [ERNIE-4.5-0.3B-Paddle.md](ERNIE-4.5-0.3B-Paddle.md)
|
||||
- [ERNIE-4.5-21B-A3B-Paddle.md](ERNIE-4.5-21B-A3B-Paddle.md)
|
||||
- [ERNIE-4.5-300B-A47B-Paddle.md](ERNIE-4.5-300B-A47B-Paddle.md)
|
||||
- [ERNIE-4.5-VL-28B-A3B-Paddle](ERNIE-4.5-VL-28B-A3B-Paddle.md)
|
||||
- [ERNIE-4.5-VL-424B-A47B-Paddle](ERNIE-4.5-VL-424B-A47B-Paddle.md)
|
docs/zh/features/graph_optimization.md (new file, 119 lines)
@@ -0,0 +1,119 @@
|
||||
# FastDeploy 中的图优化技术
|
||||
FastDeploy 的 `GraphOptimizationBackend` 中集成了多种图优化技术:
|
||||
|
||||
+ **CUDA Graph**:一种通过单个 CPU 操作启动多个 GPU 操作的机制,可以降低开销并提高性能
|
||||
|
||||
+ **动态图转静态图**:将动态图转换为静态图,利用全局图结构信息优化计算图、提升执行效率
|
||||
|
||||
+ **CINN 神经网络编译器**:在静态图的基础上执行 IR 转换、Kernel 融合、Kernel 生成等计算图编译优化方法,实现综合优化
|
||||
|
||||
任何依赖数据的控制流、Host-Device 同步、地址/形状变化的模型输入、动态的 Kernel 执行配置等动态情况都会导致 CUDAGraph Capture/Replay 失败,而大模型推理中面临的场景正是动态的输入长度、动态的 Batch Size、灵活的 Attention 实现和多卡通信,导致 CUDA Graph 难以应用。
|
||||
|
||||
开源主流方案基于静态图实现 CUDA Graph,技术栈较深。FastDeploy 不仅支持静态图、神经网络编译器、CUDAGraph 组合优化,还支持直接在动态图中应用 CUDA Graph ,开发成本更低,但面临的动态情况更复杂。
|
||||
|
||||
FastDeploy 的 `GraphOptimizationBackend` 设计架构如下,**部分功能仍在开发中,建议仔细阅读第一章节使用限制**。
|
||||
|
||||

|
||||
|
||||
## 1. GraphOptimizationBackend 当前使用限制
|
||||
### 1.1 多卡场景需要开启 Custom all-reduce
|
||||
在 CUDAGraph 多卡推理任务中,需要使用 Custom all-reduce 算子进行多卡 all-reduce。
|
||||
|
||||
在 2.2 版本之前,CUDAGraph 和 Custom all-reduce 算子都未默认开启,需要在启动命令中添加 `--enable-custom-all-reduce` 手动开启。
|
||||
|
||||
### 1.2 FLAGS_max_partition_size 相关的 Kernel 的动态执行配置导致 CUDAGraph 执行失败
|
||||
`FLAGS_max_partition_size` 环境变量控制了 CascadeAppend Attention 中 Kernel 的`gridDim` 执行配置 , 而动态的执行配置会导致 CUDAGraph 执行失败。
|
||||
|
||||
[PR#3223](https://github.com/PaddlePaddle/FastDeploy/pull/3223) 修复了这个问题,但在 2.2 之前的 Release 版本依然存在这个问题。
|
||||
|
||||
**问题自查方法:**
|
||||
+ 根据`FLAGS_max_partition_size`的值(默认是 32K)和启动参数中的 `max_model_len`计算`div_up(max_model_len, max_partition_size)`,结果大于`1`时无法执行,等于`1`时可以正常运行
|
||||
|
||||
**解决方法:**
|
||||
1. 调整`FLAGS_max_partition_size`和`max_model_len`的值,不触发动态执行配置。
|
||||
2. 关闭 CUDAGraph
|
||||
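自查计算可以用下面的 Python 小段做示意(假设 `FLAGS_max_partition_size` 取默认值 32768,`max_model_len` 为示例值):

```python
# 示意代码:按上文方法自查是否会触发动态执行配置
def div_up(a: int, b: int) -> int:
    return -(-a // b)

max_partition_size = 32768   # FLAGS_max_partition_size 默认值(见上文)
max_model_len = 131072       # 假设的启动参数

if div_up(max_model_len, max_partition_size) > 1:
    print("会触发动态执行配置,CUDAGraph 可能执行失败")
else:
    print("可以正常运行")
```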
|
||||
## 2. GraphOptimizationBackend 相关配置参数说明
|
||||
|
||||
当前仅支持用户配置以下参数:
|
||||
|
||||
+ `use_cudagraph` : bool = False
|
||||
+ `graph_optimization_config` : Dict[str, Any]
|
||||
+ `graph_opt_level`: int = 0
|
||||
+ `use_cudagraph`: bool = False
|
||||
+ `cudagraph_capture_sizes` : List[int] = None
|
||||
|
||||
可以通过设置 `--use-cudagraph` 或 `--graph-optimization-config '{"use_cudagraph":true}'` 开启 CudaGraph。
|
||||
|
||||
`--graph-optimization-config` 中的 `graph_opt_level` 参数用于配置图优化等级,可选项如下:
|
||||
|
||||
+ `0`: 动态图,默认为 0
|
||||
+ `1`: 静态图,初始化阶段会使用 Paddle API 将动态图转换为静态图
|
||||
+ `2`: 在静态图的基础上,使用 Paddle 框架编译器(CINN, Compiler Infrastructure for Neural Networks)进行编译优化
|
||||
|
||||
一般情况下静态图比动态图的 Kernel Launch 开销更小,推荐使用静态图。
|
||||
对于已适配的模型,FastDeploy 的 CudaGraph **可同时支持动态图与静态图**。
|
||||
|
||||
在默认配置下开启 CudaGraph 时,会根据 `max_num_seqs` 参数自动设置 CudaGraph 需要捕获的 Batch Size 列表,需要捕获的 Batch Size 的列表自动生成逻辑如下:
|
||||
|
||||
1. 生成一个范围为 [1,1024] Batch Size 的候选列表
|
||||
|
||||
```
|
||||
# Batch Size [1, 2, 4, 8, 16, ... 120, 128]
|
||||
candidate_capture_sizes = [1, 2, 4] + [8 * i for i in range(1, 17)]
|
||||
# Batch Size (128, 144, ... 240, 256]
|
||||
candidate_capture_sizes += [16 * i for i in range(9, 17)]
|
||||
# Batch Size (256, 288, ... 992, 1024]
|
||||
candidate_capture_sizes += [32 * i for i in range(17, 33)]
|
||||
```
|
||||
|
||||
2. 根据用户设置的 `max_num_seqs` 裁剪候选列表,得到范围为 [1, `max_num_seqs`] 的 CudaGraph 捕获列表。
|
||||
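裁剪步骤可以用下面的示意代码理解(假设为对候选列表按 `max_num_seqs` 做简单过滤,64 为示例值):

```python
max_num_seqs = 64  # 示例启动参数

candidate_capture_sizes = [1, 2, 4] + [8 * i for i in range(1, 17)]
candidate_capture_sizes += [16 * i for i in range(9, 17)]
candidate_capture_sizes += [32 * i for i in range(17, 33)]

# 仅保留不超过 max_num_seqs 的 Batch Size
cudagraph_capture_sizes = [bs for bs in candidate_capture_sizes if bs <= max_num_seqs]
print(cudagraph_capture_sizes)
```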
|
||||
用户也可以通过 `--graph-optimization-config` 中的 `cudagraph_capture_sizes` 参数自定义需要被 CudaGraph 捕获的 Batch Size 列表:
|
||||
|
||||
```
|
||||
--graph-optimization-config '{"cudagraph_capture_sizes": [1, 3, 5, 7, 9]}'
|
||||
```
|
||||
|
||||
### 2.1 CudaGraph相关参数说明
|
||||
|
||||
使用 CudaGraph 会产生一些额外的显存开销,在FastDeploy中分为下面两类:
|
||||
|
||||
+ 额外的输入 Buffer 开销
|
||||
+ CudaGraph 使用了专用的显存池,因此会持有一部分与主框架隔离的中间激活显存
|
||||
|
||||
FastDeploy 的初始化顺序为先使用 `gpu_memory_utilization` 参数计算 `KVCache` 可用的显存,初始化完 `KVCache` 之后才会使用剩余显存初始化 CudaGraph。由于 CudaGraph 目前还不是默认开启的,因此使用默认启动参数可能会遇到 `Out Of Memory` 错误,可以尝试使用下面三种方式解决:
|
||||
|
||||
+ 调低 `gpu_memory_utilization` 的值,多预留一些显存给CudaGraph使用。
|
||||
+ 调低 `max_num_seqs` 的值,降低最大并发数。
|
||||
+ 通过 `graph_optimization_config` 自定义需要 CudaGraph 捕获的 Batch Size 列表 `cudagraph_capture_sizes`,减少捕获的图的数量
|
||||
|
||||
使用CudaGraph之前,需要确保加载的模型被装饰器 ``@support_graph_optimization``正确修饰。
|
||||
|
||||
```python
|
||||
# 1. import 装饰器
|
||||
from fastdeploy.model_executor.graph_optimization.decorator import support_graph_optimization
|
||||
...
|
||||
|
||||
# 2. 添加装饰器
|
||||
@support_graph_optimization
|
||||
class Ernie4_5_Model(nn.Layer): # 注意 decorator 加在 nn.Layer 的子类上
|
||||
...
|
||||
|
||||
# 3. 修改 ModelForCasualLM 子类中 self.model() 的传参方式
|
||||
class Ernie4_5_MoeForCausalLM(ModelForCasualLM):
|
||||
...
|
||||
def forward(
|
||||
self,
|
||||
ids_remove_padding: paddle.Tensor,
|
||||
forward_meta: ForwardMeta,
|
||||
):
|
||||
hidden_states = self.model(ids_remove_padding=ids_remove_padding, # 传参时指定参数名
|
||||
forward_meta=forward_meta)
|
||||
        return hidden_states
```
|
docs/zh/features/images/GraphOptBackendArch.svg (new file, image, 30 KiB; diff suppressed)
docs/zh/features/multi-node_deployment.md (new file, 73 lines)
@@ -0,0 +1,73 @@
|
||||
# 多节点部署
|
||||
|
||||
## 概述
|
||||
多节点部署旨在解决单个机器GPU显存不足时,支持跨多台机器的张量并行执行。
|
||||
|
||||
## 环境准备
|
||||
#### 网络要求
|
||||
1. 所有节点必须在同一本地网络中
|
||||
2. 确保所有节点之间双向连通(可使用`ping`和`nc -zv`测试)
|
||||
|
||||
|
||||
#### 软件要求
|
||||
1. 所有节点安装相同版本的FastDeploy
|
||||
2. [建议安装]安装并配置MPI(OpenMPI或MPICH)
|
||||
|
||||
## 张量并行部署
|
||||
|
||||
### 推荐启动方式
|
||||
我们推荐使用mpirun进行一键启动,无需手动启动每个节点。
|
||||
|
||||
### 使用说明
|
||||
1. 在所有机器上执行相同的命令
|
||||
2. `ips`参数中的IP顺序决定了节点启动顺序
|
||||
3. 第一个IP将被指定为主节点
|
||||
4. 确保所有节点能够解析彼此的主机名
|
||||
|
||||
* 在线推理启动示例:
|
||||
```shell
|
||||
python -m fastdeploy.entrypoints.openai.api_server \
|
||||
--model baidu/ERNIE-4.5-300B-A47B-Paddle \
|
||||
--port 8180 \
|
||||
--metrics-port 8181 \
|
||||
--engine-worker-queue-port 8182 \
|
||||
--max-model-len 32768 \
|
||||
--max-num-seqs 32 \
|
||||
--tensor-parallel-size 16 \
|
||||
--ips 192.168.1.101,192.168.1.102
|
||||
```
|
||||
|
||||
* 离线启动示例:
|
||||
```python
|
||||
from fastdeploy.engine.sampling_params import SamplingParams
|
||||
from fastdeploy.entrypoints.llm import LLM
|
||||
|
||||
model_name_or_path = "baidu/ERNIE-4.5-300B-A47B-Paddle"
|
||||
|
||||
sampling_params = SamplingParams(temperature=0.1, max_tokens=30)
|
||||
llm = LLM(model=model_name_or_path, tensor_parallel_size=16, ips="192.168.1.101,192.168.1.102")
|
||||
if llm._check_master():
|
||||
output = llm.generate(prompts="你是谁?", use_tqdm=True, sampling_params=sampling_params)
|
||||
print(output)
|
||||
```
|
||||
|
||||
* 注意:
|
||||
- 只有主节点可以接收完成请求
|
||||
- 请始终将请求发送到主节点(ips列表中的第一个IP)
|
||||
- 主节点将在所有节点间分配工作负载
|
||||
|
||||
### 参数说明
|
||||
|
||||
#### `ips`参数
|
||||
- **类型**: `字符串`
|
||||
- **格式**: 逗号分隔的IPv4地址
|
||||
- **描述**: 指定部署组中所有节点的IP地址
|
||||
- **必填**: 仅多节点部署时需要
|
||||
- **示例**: `"192.168.1.101,192.168.1.102,192.168.1.103"`
|
||||
|
||||
#### `tensor_parallel_size`参数
|
||||
- **类型**: `整数`
|
||||
- **描述**: 所有节点上的GPU总数
|
||||
- **必填**: 是
|
||||
- **示例**: 对于2个节点各8个GPU,设置为16
|
||||
|
docs/zh/features/plugins.md (new file, 85 lines)
@@ -0,0 +1,85 @@
|
||||
# FastDeploy 插件机制说明文档
|
||||
|
||||
FastDeploy 支持插件机制,允许用户在不修改核心代码的前提下扩展功能。插件通过 Python 的 `entry_points` 机制实现自动发现与加载。
|
||||
|
||||
## 插件工作原理
|
||||
|
||||
插件本质上是在 FastDeploy 启动时被自动调用的注册函数。系统使用 `load_plugins_by_group` 函数确保所有进程(包括分布式训练场景下的子进程)在正式运行前都已加载所需的插件。
|
||||
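下面是一个基于 `entry_points` 的加载逻辑示意(假设实现,并非 FastDeploy 源码;`entry_points(group=...)` 的筛选用法需要 Python 3.10+):

```python
# 示意代码:演示基于 entry_points 的插件发现与加载逻辑
from importlib.metadata import entry_points

def load_plugins_by_group(group: str = "fastdeploy.model_register_plugins") -> None:
    for ep in entry_points(group=group):   # 按插件组筛选已注册的入口点
        register_fn = ep.load()            # 加载 "模块名:函数名" 指向的入口函数
        register_fn()                      # 调用注册函数完成模型注册

load_plugins_by_group()
```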
|
||||
## 插件发现机制
|
||||
|
||||
FastDeploy 利用 Python 的 `entry_points` 机制来发现并加载插件。开发者需在自己的项目中将插件注册到指定的 entry point 组中。
|
||||
|
||||
### 示例:创建一个插件
|
||||
|
||||
#### 1. 编写插件逻辑
|
||||
|
||||
假设你有一个自定义模型类 `MyModelForCasualLM` 和预训练类 `MyPretrainedModel`,你可以编写如下注册函数:
|
||||
|
||||
```python
|
||||
# 文件:fd_add_dummy_model/__init__.py
|
||||
from fastdeploy.model_registry import ModelRegistry
|
||||
from my_custom_model import MyModelForCasualLM, MyPretrainedModel
|
||||
|
||||
def register():
|
||||
if "MyModelForCasualLM" not in ModelRegistry.get_supported_archs():
|
||||
ModelRegistry.register_model_class(MyModelForCasualLM)
|
||||
ModelRegistry.register_pretrained_model(MyPretrainedModel)
|
||||
```
|
||||
|
||||
#### 2. 注册插件到 `setup.py`
|
||||
|
||||
```python
|
||||
# setup.py
|
||||
from setuptools import setup
|
||||
|
||||
setup(
|
||||
name="fastdeploy-plugins",
|
||||
version="0.1",
|
||||
packages=["fd_add_dummy_model"],
|
||||
entry_points={
|
||||
"fastdeploy.model_register_plugins": [
|
||||
"fd_add_dummy_model = fd_add_dummy_model:register",
|
||||
],
|
||||
},
|
||||
)
|
||||
```
|
||||
|
||||
## 插件结构说明
|
||||
|
||||
插件由三部分组成:
|
||||
|
||||
| 组件 | 说明 |
|
||||
|------|------|
|
||||
| **插件组(Group)** | 插件所属的功能分组,例如:<br> - `fastdeploy.model_register_plugins`: 用于注册模型<br> - `fastdeploy.model_runner_plugins`: 用于注册模型运行器<br> 用户可根据需要自定义分组。 |
|
||||
| **插件名(Name)** | 每个插件的唯一标识名(如 `fd_add_dummy_model`),可通过环境变量 `FD_PLUGINS` 控制是否加载该插件。 |
|
||||
| **插件值(Value)** | 格式为 `模块名:函数名`,指向实际执行注册逻辑的入口函数。 |
|
||||
|
||||
## 控制插件加载行为
|
||||
|
||||
默认情况下,FastDeploy 会加载所有已注册的插件。若只想加载特定插件,可以设置环境变量:
|
||||
|
||||
```bash
|
||||
export FD_PLUGINS=fastdeploy-plugins
|
||||
```
|
||||
|
||||
多个插件名之间可以用逗号分隔:
|
||||
|
||||
```bash
|
||||
export FD_PLUGINS=plugin_a,plugin_b
|
||||
```
|
||||
|
||||
## 参考示例
|
||||
|
||||
请参见项目目录下的示例插件实现:
|
||||
```
|
||||
./test/plugins/
|
||||
```
|
||||
|
||||
其中包含完整的插件结构和 `setup.py` 配置示例。
|
||||
|
||||
## 总结
|
||||
|
||||
通过插件机制,用户可以轻松地为 FastDeploy 添加自定义模型或功能模块,而无需修改核心源码。这不仅提升了系统的可扩展性,也方便了第三方开发者进行功能拓展。
|
||||
|
||||
如需进一步开发插件,请参考 FastDeploy 源码中的 `model_registry` 和 `plugin_loader` 模块。
|
@@ -98,7 +98,7 @@ curl -X POST "http://0.0.0.0:9222/v1/chat/completions" \
|
||||
{"role": "user", "content": "How old are you"}
|
||||
],
|
||||
"top_p": 0.8,
|
||||
"top_k": 50
|
||||
"top_k": 20
|
||||
}'
|
||||
```
|
||||
|
||||
@@ -118,7 +118,7 @@ response = client.chat.completions.create(
|
||||
],
|
||||
stream=True,
|
||||
top_p=0.8,
|
||||
extra_body={"top_k": 50}
|
||||
extra_body={"top_k": 20}
|
||||
)
|
||||
for chunk in response:
|
||||
if chunk.choices[0].delta:
|
||||
@@ -161,8 +161,7 @@ response = client.chat.completions.create(
|
||||
],
|
||||
stream=True,
|
||||
top_p=0.8,
|
||||
extra_body={"top_k": 20},
|
||||
min_p=0.1
|
||||
extra_body={"top_k": 20, "min_p": 0.1}
|
||||
)
|
||||
for chunk in response:
|
||||
if chunk.choices[0].delta:
|
||||
|
@@ -23,6 +23,7 @@
|
||||
**注意**: 由于模型参数量为424B-A47B,在80G * 8卡的机器上,需指定```--quantization wint4```(wint8也可部署)。
|
||||
|
||||
```shell
|
||||
export ENABLE_V1_KVCACHE_SCHEDULER=1
|
||||
python -m fastdeploy.entrypoints.openai.api_server \
|
||||
--model baidu/ERNIE-4.5-VL-424B-A47B-Paddle \
|
||||
--port 8180 --engine-worker-queue-port 8181 \
|
||||
@@ -31,7 +32,6 @@ python -m fastdeploy.entrypoints.openai.api_server \
|
||||
--quantization wint4 \
|
||||
--max-model-len 32768 \
|
||||
--max-num-seqs 32 \
|
||||
--enable-mm \
|
||||
--mm-processor-kwargs '{"video_max_frames": 30}' \
|
||||
--limit-mm-per-prompt '{"image": 10, "video": 3}' \
|
||||
--reasoning-parser ernie-45-vl
|
||||
|
@@ -21,6 +21,7 @@
|
||||
执行如下命令,启动服务,其中启动命令配置方式参考[参数说明](../parameters.md)。
|
||||
|
||||
```shell
|
||||
export ENABLE_V1_KVCACHE_SCHEDULER=1
|
||||
python -m fastdeploy.entrypoints.openai.api_server \
|
||||
--model baidu/ERNIE-4.5-300B-A47B-Paddle \
|
||||
--port 8180 --engine-worker-queue-port 8181 \
|
||||
|
@@ -1,6 +1,6 @@
|
||||
# FastDeploy Installation Guide
|
||||
# FastDeploy 安装
|
||||
|
||||
FastDeploy currently supports installation on the following hardware platforms:
|
||||
FastDeploy支持如下硬件平台:
|
||||
|
||||
- [NVIDIA GPU Installation](nvidia_gpu.md)
|
||||
- [Kunlunxin XPU Installation](kunlunxin_xpu.md)
|
||||
|
@@ -3,6 +3,7 @@
|
||||
|
||||
## 准备机器
|
||||
首先您需要准备以下配置的机器
|
||||
|
||||
| CPU | 内存 | 天数智芯 GPU | 硬盘|
|
||||
|-----|------|-----|-----|
|
||||
| x86 | 1TB| 8xBI150| 1TB|
|
||||
|
@@ -5,8 +5,8 @@
|
||||
- OS:Linux
|
||||
- Python:3.10
|
||||
- XPU 型号:P800
|
||||
- XPU 驱动版本:≥ 5.0.21.10
|
||||
- XPU 固件版本:≥ 1.31
|
||||
- XPU 驱动版本:≥ 5.0.21.26
|
||||
- XPU 固件版本:≥ 1.48
|
||||
|
||||
已验证的平台:
|
||||
- CPU:INTEL(R) XEON(R) PLATINUM 8563C / Hygon C86-4G 7490 64-core Processor
|
||||
@@ -15,8 +15,8 @@
|
||||
- OS:CentOS release 7.6 (Final)
|
||||
- Python:3.10
|
||||
- XPU 型号:P800(OAM 版)
|
||||
- XPU 驱动版本:5.0.21.10
|
||||
- XPU 固件版本:1.31
|
||||
- XPU 驱动版本:5.0.21.26
|
||||
- XPU 固件版本:1.48
|
||||
|
||||
**注:** 目前只验证过 INTEL 或海光 CPU OAM 版 P800 服务器,暂未验证其它 CPU 和 PCIe 版 P800 服务器。
|
||||
|
||||
@@ -25,9 +25,9 @@
|
||||
```bash
|
||||
mkdir Work
|
||||
cd Work
|
||||
docker pull ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/fastdeploy-xpu:2.0.3
|
||||
docker pull ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/fastdeploy-xpu:2.1.0
|
||||
docker run --name fastdeploy-xpu --net=host -itd --privileged -v $PWD:/Work -w /Work \
|
||||
ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/fastdeploy-xpu:2.0.3 \
|
||||
ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/fastdeploy-xpu:2.1.0 \
|
||||
/bin/bash
|
||||
docker exec -it fastdeploy-xpu /bin/bash
|
||||
```
|
||||
@@ -37,7 +37,7 @@ docker exec -it fastdeploy-xpu /bin/bash
|
||||
### 安装 PaddlePaddle
|
||||
|
||||
```bash
|
||||
python -m pip install paddlepaddle-xpu==3.1.0 -i https://www.paddlepaddle.org.cn/packages/stable/xpu-p800/
|
||||
python -m pip install paddlepaddle-xpu==3.1.1 -i https://www.paddlepaddle.org.cn/packages/stable/xpu-p800/
|
||||
```
|
||||
|
||||
或者您也可以安装最新版 PaddlePaddle(不推荐)
|
||||
@@ -49,7 +49,7 @@ python -m pip install --pre paddlepaddle-xpu -i https://www.paddlepaddle.org.cn/
|
||||
### 安装 FastDeploy(**注意不要通过 pypi 源安装**)
|
||||
|
||||
```bash
|
||||
python -m pip install fastdeploy-xpu==2.0.3 -i https://www.paddlepaddle.org.cn/packages/stable/fastdeploy-xpu-p800/ --extra-index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
|
||||
python -m pip install fastdeploy-xpu==2.1.0 -i https://www.paddlepaddle.org.cn/packages/stable/fastdeploy-xpu-p800/ --extra-index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
|
||||
```
|
||||
|
||||
或者你也可以安装最新版 FastDeploy(不推荐)
|
||||
@@ -63,7 +63,7 @@ python -m pip install --pre fastdeploy-xpu -i https://www.paddlepaddle.org.cn/pa
|
||||
### 安装 PaddlePaddle
|
||||
|
||||
```bash
|
||||
python -m pip install paddlepaddle-xpu==3.1.0 -i https://www.paddlepaddle.org.cn/packages/stable/xpu-p800/
|
||||
python -m pip install paddlepaddle-xpu==3.1.1 -i https://www.paddlepaddle.org.cn/packages/stable/xpu-p800/
|
||||
```
|
||||
|
||||
或者您也可以安装最新版 PaddlePaddle(不推荐)
|
||||
|
@@ -15,7 +15,7 @@
|
||||
**注意**: 如下镜像仅支持SM 80/90架构GPU(A800/H800等),如果你是在L20/L40/4090等SM 86/69架构的GPU上部署,请在创建容器后,卸载```fastdeploy-gpu```再重新安装如下文档指定支持86/89架构的`fastdeploy-gpu`包。
|
||||
|
||||
``` shell
|
||||
docker pull ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/fastdeploy-cuda-12.6:2.0.0
|
||||
docker pull ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/fastdeploy-cuda-12.6:2.1.0
|
||||
```
|
||||
|
||||
## 2. 预编译Pip安装
|
||||
@@ -23,7 +23,7 @@ docker pull ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/fastdeploy-cuda-12
|
||||
首先安装 paddlepaddle-gpu,详细安装方式参考 [PaddlePaddle安装](https://www.paddlepaddle.org.cn/en/install/quick?docurl=/documentation/docs/en/develop/install/pip/linux-pip_en.html)
|
||||
|
||||
``` shell
|
||||
python -m pip install paddlepaddle-gpu==3.1.0 -i https://www.paddlepaddle.org.cn/packages/stable/cu126/
|
||||
python -m pip install paddlepaddle-gpu==3.1.1 -i https://www.paddlepaddle.org.cn/packages/stable/cu126/
|
||||
```
|
||||
|
||||
再安装 fastdeploy,**注意不要通过pypi源安装**,需要通过如下方式安装
|
||||
@@ -64,7 +64,7 @@ docker build -f dockerfiles/Dockerfile.gpu -t fastdeploy:gpu .
|
||||
首先安装 paddlepaddle-gpu,详细安装方式参考 [PaddlePaddle安装](https://www.paddlepaddle.org.cn/)
|
||||
|
||||
``` shell
|
||||
python -m pip install paddlepaddle-gpu==3.1.0 -i https://www.paddlepaddle.org.cn/packages/stable/cu126/
|
||||
python -m pip install paddlepaddle-gpu==3.1.1 -i https://www.paddlepaddle.org.cn/packages/stable/cu126/
|
||||
```
|
||||
|
||||
接着克隆源代码,编译安装
|
||||
|
@@ -17,6 +17,7 @@
|
||||
安装FastDeploy后,在终端执行如下命令,启动服务,其中启动命令配置方式参考[参数说明](../parameters.md)
|
||||
|
||||
```shell
|
||||
export ENABLE_V1_KVCACHE_SCHEDULER=1
|
||||
python -m fastdeploy.entrypoints.openai.api_server \
|
||||
--model baidu/ERNIE-4.5-0.3B-Paddle \
|
||||
--port 8180 \
|
||||
|
@@ -19,6 +19,7 @@
|
||||
安装FastDeploy后,在终端执行如下命令,启动服务,其中启动命令配置方式参考[参数说明](../parameters.md)
|
||||
|
||||
```shell
|
||||
export ENABLE_V1_KVCACHE_SCHEDULER=1
|
||||
python -m fastdeploy.entrypoints.openai.api_server \
|
||||
--model baidu/ERNIE-4.5-VL-28B-A3B-Paddle \
|
||||
--port 8180 \
|
||||
@@ -26,8 +27,7 @@ python -m fastdeploy.entrypoints.openai.api_server \
|
||||
--engine-worker-queue-port 8182 \
|
||||
--max-model-len 32768 \
|
||||
--max-num-seqs 32 \
|
||||
--reasoning-parser ernie-45-vl \
|
||||
--enable-mm
|
||||
--reasoning-parser ernie-45-vl
|
||||
```
|
||||
|
||||
>💡 注意:在 ```--model``` 指定的路径中,若当前目录下不存在该路径对应的子目录,则会尝试根据指定的模型名称(如 ```baidu/ERNIE-4.5-0.3B-Base-Paddle```)查询AIStudio是否存在预置模型,若存在,则自动启动下载。默认的下载路径为:```~/xx```。关于模型自动下载的说明和配置参阅[模型下载](../supported_models.md)。
|
||||
|
@@ -13,12 +13,12 @@
|
||||
|
||||
| Model | Data Type | PD Disaggregation | Chunked Prefill | Prefix Caching | MTP | CUDA Graph | Maximum Context Length |
|
||||
|:--- | :------- | :---------- | :-------- | :-------- | :----- | :----- | :----- |
|
||||
|ERNIE-4.5-300B-A47B | BF16/WINT4/WINT8/W4A8C8/WINT2/FP8 | ✅| ✅ | ✅|✅(WINT4)| WIP |128K |
|
||||
|ERNIE-4.5-300B-A47B-Base| BF16/WINT4/WINT8 | ✅| ✅ | ✅|✅(WINT4)| WIP | 128K |
|
||||
|ERNIE-4.5-300B-A47B | BF16/WINT4/WINT8/W4A8C8/WINT2/FP8 | ✅| ✅ | ✅|✅| WIP |128K |
|
||||
|ERNIE-4.5-300B-A47B-Base| BF16/WINT4/WINT8 | ✅| ✅ | ✅|❌| WIP | 128K |
|
||||
|ERNIE-4.5-VL-424B-A47B | BF16/WINT4/WINT8 | WIP | ✅ | WIP | ❌ | WIP |128K |
|
||||
|ERNIE-4.5-VL-28B-A3B | BF16/WINT4/WINT8 | ❌ | ✅ | WIP | ❌ | WIP |128K |
|
||||
|ERNIE-4.5-21B-A3B | BF16/WINT4/WINT8/FP8 | ❌ | ✅ | ✅ | WIP | ✅|128K |
|
||||
|ERNIE-4.5-21B-A3B-Base | BF16/WINT4/WINT8/FP8 | ❌ | ✅ | ✅ | WIP | ✅|128K |
|
||||
|ERNIE-4.5-21B-A3B | BF16/WINT4/WINT8/FP8 | ❌ | ✅ | ✅ | ✅ | ✅|128K |
|
||||
|ERNIE-4.5-21B-A3B-Base | BF16/WINT4/WINT8/FP8 | ❌ | ✅ | ✅ | ❌ | ✅|128K |
|
||||
|ERNIE-4.5-0.3B | BF16/WINT8/FP8 | ❌ | ✅ | ✅ | ❌ | ✅| 128K |
|
||||
|
||||
## 文档说明
|
||||
|
@@ -39,7 +39,7 @@ for output in outputs:
|
||||
```python
|
||||
from fastdeploy.entrypoints.llm import LLM
|
||||
# 加载模型
|
||||
llm = LLM(model="baidu/ERNIE-4.5-VL-28B-A3B-Paddle", tensor_parallel_size=1, max_model_len=32768, enable_mm=True, limit_mm_per_prompt={"image": 100}, reasoning_parser="ernie-45-vl")
|
||||
llm = LLM(model="baidu/ERNIE-4.5-VL-28B-A3B-Paddle", tensor_parallel_size=1, max_model_len=32768, limit_mm_per_prompt={"image": 100}, reasoning_parser="ernie-45-vl")
|
||||
|
||||
outputs = llm.chat(
|
||||
messages=[
|
||||
@@ -127,7 +127,7 @@ for message in messages:
|
||||
})
|
||||
|
||||
sampling_params = SamplingParams(temperature=0.1, max_tokens=6400)
|
||||
llm = LLM(model=PATH, tensor_parallel_size=1, max_model_len=32768, enable_mm=True, limit_mm_per_prompt={"image": 100}, reasoning_parser="ernie-45-vl")
|
||||
llm = LLM(model=PATH, tensor_parallel_size=1, max_model_len=32768, limit_mm_per_prompt={"image": 100}, reasoning_parser="ernie-45-vl")
|
||||
outputs = llm.generate(prompts={
|
||||
"prompt": prompt,
|
||||
"multimodal_data": {
|
||||
@@ -183,6 +183,7 @@ for output in outputs:
|
||||
* min_p(float): token入选的最小概率阈值(相对于最高概率token的比值,设为>0可通过过滤低概率token来提升文本生成质量)
|
||||
* max_tokens(int): 限制模型生成的最大token数量(包括输入和输出)
|
||||
* min_tokens(int): 强制模型生成的最少token数量,避免过早结束
|
||||
* bad_words(list[str]): 禁止生成的词列表, 防止模型生成不希望出现的词
|
||||
|
||||
### 2.5 fastdeploy.engine.request.RequestOutput
|
||||
|
||||
|
@@ -6,6 +6,8 @@
|:-----------------------------------|:----------| :----- |
| ```port``` | `int` | Service deployment only; HTTP port for serving requests, default 8000 |
| ```metrics_port``` | `int` | Service deployment only; port of the Metrics monitoring endpoint, default 8001 |
| ```max_waiting_time``` | `int` | Service deployment only; maximum wait time for establishing a connection, default -1 (no limit) |
| ```max_concurrency``` | `int` | Service deployment only; number of connections the service actually establishes, default 512 |
| ```engine_worker_queue_port``` | `int` | Port for FastDeploy's internal engine-process communication, default 8002 |
| ```cache_queue_port``` | `int` | Port for FastDeploy's internal KVCache-process communication, default 8003 |
| ```max_model_len``` | `int` | Default maximum context length supported for inference, default 2048 |
@@ -17,7 +19,7 @@
| ```tokenizer``` | `str` | Tokenizer name or path, defaults to the model path |
| ```use_warmup``` | `int` | Whether to warm up at startup; inputs of the maximum length are generated automatically for warmup; used by default when the KV Cache is computed automatically |
| ```limit_mm_per_prompt``` | `dict[str]` | Limit on the amount of multimodal data per prompt, e.g. {"image": 10, "video": 3}; default is 1 for each |
| ```enable_mm``` | `bool` | Whether to support multimodal data (multimodal models only), default False |
| ```enable_mm``` | `bool` | __[Deprecated]__ Whether to support multimodal data (multimodal models only), default False |
| ```quantization``` | `str` | Model quantization strategy; when loading a BF16 checkpoint, specifying wint4 or wint8 enables lossless online 4-bit/8-bit quantization |
| ```gpu_memory_utilization``` | `float` | GPU memory utilization, default 0.9 |
| ```num_gpu_blocks_override``` | `int` | Number of pre-allocated KVCache blocks; FastDeploy can compute this automatically from the available memory, so users normally do not need to set it; default None |
@@ -31,8 +33,8 @@
| ```long_prefill_token_threshold``` | `int` | With Chunked Prefill enabled, requests whose token count exceeds this value are treated as long requests; default max_model_len*0.04 |
| ```static_decode_blocks``` | `int` | During inference, number of KVCache blocks each request hands over from Prefill to Decode, default 2 |
| ```reasoning_parser``` | `str` | Reasoning parser to use for extracting reasoning content from the model output |
| ```use_cudagraph``` | `bool` | Whether to use CUDA Graph, default False |
|```graph_optimization_config``` | `str` | Parameters for computation-graph optimization, default '{"use_cudagraph":false, "graph_opt_level":0, "cudagraph_capture_sizes": null }' |
| ```use_cudagraph``` | `bool` | Whether to use CUDA Graph, default False. Read [graph_optimization.md](./features/graph_optimization.md) carefully before enabling it; multi-GPU scenarios also require Custom all-reduce. |
| ```graph_optimization_config``` | `dict[str]` | Parameters for computation-graph optimization, default '{"use_cudagraph":false, "graph_opt_level":0, "cudagraph_capture_sizes": null }'; see [graph_optimization.md](./features/graph_optimization.md) for details |
| ```enable_custom_all_reduce``` | `bool` | Enable Custom all-reduce, default False |
| ```splitwise_role``` | `str` | Whether to enable splitwise inference, default mixed; supported values are ["mixed", "decode", "prefill"] |
| ```innode_prefill_ports``` | `str` | Internal engine ports of prefill instances (only needed for single-node PD disaggregation), default None |
@@ -42,6 +44,8 @@
| ```dynamic_load_weight``` | `int` | Whether to load weights dynamically, default 0 |
| ```enable_expert_parallel``` | `bool` | Whether to enable expert parallelism |
| ```enable_logprob``` | `bool` | Whether to return logprobs for output tokens. If logprobs are not used, this flag can be omitted at startup. |
| ```tool_call_parser``` | `str` | Function-call parser to use for extracting function calls from the model output |
| ```tool_parser_plugin``` | `str` | Path of a tool-parser file to register, so that parsers outside the code base can be used; the parser code must follow the format used in the code base |
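As a rough illustration of how a few of these options fit together in offline use, the sketch below passes them as keyword arguments to `LLM`. The values are placeholders, and forwarding these engine arguments through `LLM` keyword parameters is an assumption made for the example rather than a documented contract.

```python
from fastdeploy.entrypoints.llm import LLM

# Placeholder values for illustration only.
llm = LLM(
    model="baidu/ERNIE-4.5-VL-28B-A3B-Paddle",
    max_model_len=32768,
    gpu_memory_utilization=0.9,
    reasoning_parser="ernie-45-vl",   # extract reasoning content from the output
    enable_logprob=True,              # return logprobs for generated tokens
)
```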
## 1. How are KVCache allocation, ```num_gpu_blocks_override``` and ```block_size``` related?

@@ -65,84 +69,3 @@ During FastDeploy inference, GPU memory is occupied by the ```model weights```, the ```pre-allocated KVCache
When `enable_chunked_prefill` is enabled, the service processes long input sequences in dynamically sized chunks, which significantly improves GPU utilization. In this mode the original `max_num_batched_tokens` parameter no longer bounds the number of tokens batched in the prefill phase (it limits the token count of a single prefill), so `max_num_partial_prefills` is introduced to limit the number of chunked prefill batches processed concurrently.

To give short requests better scheduling priority, the `max_long_partial_prefills` and `long_prefill_token_threshold` parameters are added as a pair: the former limits how many long requests a single prefill batch may contain, and the latter defines the token threshold above which a request counts as long. The system reserves batching room for short requests first, which lowers short-request latency under mixed workloads while keeping overall throughput stable. A configuration sketch follows below.
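The sketch below shows one way the chunked-prefill knobs described above could be combined for an offline `LLM`. The values are illustrative, and passing these engine arguments as `LLM` keyword parameters is an assumption made for the purpose of the example.

```python
from fastdeploy.entrypoints.llm import LLM

# Illustrative values: up to 4 concurrent chunked prefills,
# of which at most 1 may be a "long" request (> 4096 tokens here).
llm = LLM(
    model="baidu/ERNIE-4.5-21B-A3B-Paddle",
    max_model_len=32768,
    enable_chunked_prefill=True,
    max_num_partial_prefills=4,
    max_long_partial_prefills=1,
    long_prefill_token_threshold=4096,
)
```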
## 4. GraphOptimizationBackend configuration parameters
Currently only the following parameters can be configured by the user:
- `use_cudagraph` : bool = False
- `graph_optimization_config` : Dict[str, Any]
  - `graph_opt_level`: int = 0
  - `use_cudagraph`: bool = False
  - `cudagraph_capture_sizes` : List[int] = None

CudaGraph can be enabled by setting `--use-cudagraph` or `--graph-optimization-config '{"use_cudagraph":true}'`.

The `graph_opt_level` field of `--graph-optimization-config` selects the graph optimization level; the options are:
- `0`: dynamic graph (the default)
- `1`: static graph; during initialization the dynamic graph is converted to a static graph with the Paddle API
- `2`: on top of the static graph, apply compiler optimization with the Paddle framework compiler (CINN, Compiler Infrastructure for Neural Networks)

In general a static graph has lower kernel-launch overhead than a dynamic graph, so the static graph is recommended.
For models that have already been adapted, FastDeploy's CudaGraph **supports both dynamic and static graphs**.
When CudaGraph is enabled under the default configuration, the list of batch sizes that CudaGraph needs to capture is derived automatically from `max_num_seqs`. The capture list is generated as follows:
1. Generate a candidate list of batch sizes covering the range [1, 1024]:

```
# Batch Size [1, 2, 4, 8, 16, ... 120, 128]
candidate_capture_sizes = [1, 2, 4] + [8 * i for i in range(1, 17)]
# Batch Size (128, 144, ... 240, 256]
candidate_capture_sizes += [16 * i for i in range(9, 17)]
# Batch Size (256, 288, ... 992, 1024]
candidate_capture_sizes += [32 * i for i in range(17, 33)]
```

2. Trim the candidate list according to the user-specified `max_num_seqs`, yielding a CudaGraph capture list covering [1, `max_num_seqs`].
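For completeness, here is a minimal sketch of the trimming step; the exact filtering rule inside FastDeploy may differ, so treat this as an assumption based on the description above.

```python
# Assumed illustration of step 2: keep only candidate batch sizes <= max_num_seqs.
max_num_seqs = 96

candidate_capture_sizes = [1, 2, 4] + [8 * i for i in range(1, 17)]
candidate_capture_sizes += [16 * i for i in range(9, 17)]
candidate_capture_sizes += [32 * i for i in range(17, 33)]

cudagraph_capture_sizes = [bs for bs in candidate_capture_sizes if bs <= max_num_seqs]
print(cudagraph_capture_sizes)  # [1, 2, 4, 8, 16, ..., 88, 96]
```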
Users can also customize the list of batch sizes captured by CudaGraph through the `cudagraph_capture_sizes` field of `--graph-optimization-config`:

```
--graph-optimization-config '{"cudagraph_capture_sizes": [1, 3, 5, 7, 9]}'
```
### CudaGraph-related notes
Using CudaGraph adds some extra GPU memory overhead, which in FastDeploy falls into two categories:
- Extra input-buffer overhead
- CudaGraph uses a dedicated memory pool, so it holds some intermediate-activation memory that is isolated from the main framework

FastDeploy initializes in the following order: it first uses `gpu_memory_utilization` to work out how much memory the `KVCache` can use, and only after the `KVCache` is initialized does it use the remaining memory to initialize CudaGraph. Because CudaGraph is not enabled by default yet, the default launch parameters may run into an `Out Of Memory` error; the following three approaches can help:
- Lower `gpu_memory_utilization` to reserve more memory for CudaGraph.
- Lower `max_num_seqs` to reduce the maximum concurrency.
- Customize the list of batch sizes that CudaGraph captures (`cudagraph_capture_sizes` in `graph_optimization_config`) to reduce the number of captured graphs.

Before using CudaGraph, make sure the loaded model is correctly decorated with ```@support_graph_optimization```.
```python
# 1. import the decorator
from fastdeploy.model_executor.graph_optimization.decorator import support_graph_optimization
...

# 2. apply the decorator
@support_graph_optimization
class Ernie4_5_Model(nn.Layer):  # note: the decorator goes on the nn.Layer subclass
    ...

# 3. change how self.model() is called in the ModelForCasualLM subclass
class Ernie4_5_MoeForCausalLM(ModelForCasualLM):
    ...
    def forward(
        self,
        ids_remove_padding: paddle.Tensor,
        forward_meta: ForwardMeta,
    ):
        hidden_states = self.model(ids_remove_padding=ids_remove_padding,  # pass the arguments by keyword
                                   forward_meta=forward_meta)
        return hidden_states

from fastdeploy.model_executor.graph_optimization.decorator import support_graph_optimization
...

@support_graph_optimization
class Ernie45TModel(nn.Layer):  # note: the decorator goes on the nn.Layer subclass
    ...
```
- When ```use_cudagraph``` is enabled, only single-GPU inference is supported for now, i.e. ```tensor_parallel_size``` must be set to 1.
- When ```use_cudagraph``` is enabled, ```enable_prefix_caching``` and ```enable_chunked_prefill``` cannot be enabled yet.
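Putting the constraints above together, a minimal offline configuration that enables CudaGraph might look like the sketch below. The values and the exact keyword plumbing through `LLM` are assumptions made for illustration.

```python
from fastdeploy.entrypoints.llm import LLM

# Single GPU; prefix caching and chunked prefill left disabled, as required above.
llm = LLM(
    model="baidu/ERNIE-4.5-0.3B-Paddle",
    tensor_parallel_size=1,
    graph_optimization_config={
        "use_cudagraph": True,
        "graph_opt_level": 1,                    # static graph
        "cudagraph_capture_sizes": [1, 2, 4, 8],
    },
)
```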
@@ -1,4 +1,5 @@
# FastDeploy environment variables

FastDeploy's environment variables are defined in the fastdeploy/envs.py file at the repository root; they are documented below:

```python
@@ -37,7 +38,7 @@ environment_variables: dict[str, Callable[[], Any]] = {

    # Whether to use the HuggingFace tokenizer
    "FD_USE_HF_TOKENIZER":
    lambda: os.getenv("FD_USE_HF_TOKENIZER", 0),
    lambda: bool(int(os.getenv("FD_USE_HF_TOKENIZER", 0))),

    # High-water mark (HWM) for receiving data during ZMQ initialization
    "FD_ZMQ_SNDHWM":
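As a usage sketch: environment variables such as `FD_USE_HF_TOKENIZER` are set before FastDeploy starts and can then be read through `fastdeploy.envs` (attribute-style access is used elsewhere in this code base, e.g. `envs.FD_DEBUG`).

```python
import os

# Set before FastDeploy is imported or the server is launched.
os.environ["FD_USE_HF_TOKENIZER"] = "1"

from fastdeploy import envs

if envs.FD_USE_HF_TOKENIZER:
    print("HuggingFace tokenizer enabled")
```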
@@ -3,8 +3,14 @@
|-|-|-|-|-|-|
|ERNIE-4.5-300B-A47B|32K|WINT8|8|export XPU_VISIBLE_DEVICES="0,1,2,3,4,5,6,7"<br>python -m fastdeploy.entrypoints.openai.api_server \ <br> --model PaddlePaddle/ERNIE-4.5-300B-A47B-Paddle \ <br> --port 8188 \ <br> --tensor-parallel-size 8 \ <br> --max-model-len 32768 \ <br> --max-num-seqs 64 \ <br> --quantization "wint8" \ <br> --gpu-memory-utilization 0.9|>=2.0.3|
|ERNIE-4.5-300B-A47B|32K|WINT4|4 (recommended)|export XPU_VISIBLE_DEVICES="0,1,2,3" or "4,5,6,7"<br>python -m fastdeploy.entrypoints.openai.api_server \ <br> --model PaddlePaddle/ERNIE-4.5-300B-A47B-Paddle \ <br> --port 8188 \ <br> --tensor-parallel-size 4 \ <br> --max-model-len 32768 \ <br> --max-num-seqs 64 \ <br> --quantization "wint4" \ <br> --gpu-memory-utilization 0.9|>=2.0.0|
|ERNIE-4.5-300B-A47B|32K|WINT4|8|export XPU_VISIBLE_DEVICES="0,1,2,3,4,5,6,7"<br>python -m fastdeploy.entrypoints.openai.api_server \ <br> --model PaddlePaddle/ERNIE-4.5-300B-A47B-Paddle \ <br> --port 8188 \ <br> --tensor-parallel-size 8 \ <br> --max-model-len 32768 \ <br> --max-num-seqs 64 \ <br> --quantization "wint4" \ <br> --gpu-memory-utilization 0.9|>=2.0.0|
|ERNIE-4.5-300B-A47B|32K|WINT4|8|export XPU_VISIBLE_DEVICES="0,1,2,3,4,5,6,7"<br>python -m fastdeploy.entrypoints.openai.api_server \ <br> --model PaddlePaddle/ERNIE-4.5-300B-A47B-Paddle \ <br> --port 8188 \ <br> --tensor-parallel-size 8 \ <br> --max-model-len 32768 \ <br> --max-num-seqs 64 \ <br> --quantization "wint4" \ <br> --gpu-memory-utilization 0.95|>=2.0.0|
|ERNIE-4.5-300B-A47B|128K|WINT4|8 (recommended)|export XPU_VISIBLE_DEVICES="0,1,2,3,4,5,6,7"<br>python -m fastdeploy.entrypoints.openai.api_server \ <br> --model PaddlePaddle/ERNIE-4.5-300B-A47B-Paddle \ <br> --port 8188 \ <br> --tensor-parallel-size 8 \ <br> --max-model-len 131072 \ <br> --max-num-seqs 64 \ <br> --quantization "wint4" \ <br> --gpu-memory-utilization 0.9|>=2.0.0|
|ERNIE-4.5-21B-A3B|32K|BF16|1|export XPU_VISIBLE_DEVICES="0" # pick any single card<br>python -m fastdeploy.entrypoints.openai.api_server \ <br> --model PaddlePaddle/ERNIE-4.5-21B-A3B-Paddle \ <br> --port 8188 \ <br> --tensor-parallel-size 1 \ <br> --max-model-len 32768 \ <br> --max-num-seqs 128 \ <br> --gpu-memory-utilization 0.9|>=2.1.0|
|ERNIE-4.5-21B-A3B|32K|WINT8|1|export XPU_VISIBLE_DEVICES="0" # pick any single card<br>python -m fastdeploy.entrypoints.openai.api_server \ <br> --model PaddlePaddle/ERNIE-4.5-21B-A3B-Paddle \ <br> --port 8188 \ <br> --tensor-parallel-size 1 \ <br> --max-model-len 32768 \ <br> --max-num-seqs 128 \ <br> --quantization "wint8" \ <br> --gpu-memory-utilization 0.9|>=2.1.0|
|ERNIE-4.5-21B-A3B|32K|WINT4|1|export XPU_VISIBLE_DEVICES="0" # pick any single card<br>python -m fastdeploy.entrypoints.openai.api_server \ <br> --model PaddlePaddle/ERNIE-4.5-21B-A3B-Paddle \ <br> --port 8188 \ <br> --tensor-parallel-size 1 \ <br> --max-model-len 32768 \ <br> --max-num-seqs 128 \ <br> --quantization "wint4" \ <br> --gpu-memory-utilization 0.9|>=2.1.0|
|ERNIE-4.5-21B-A3B|128K|BF16|1|export XPU_VISIBLE_DEVICES="0" # pick any single card<br>python -m fastdeploy.entrypoints.openai.api_server \ <br> --model PaddlePaddle/ERNIE-4.5-21B-A3B-Paddle \ <br> --port 8188 \ <br> --tensor-parallel-size 1 \ <br> --max-model-len 131072 \ <br> --max-num-seqs 128 \ <br> --gpu-memory-utilization 0.9|>=2.1.0|
|ERNIE-4.5-21B-A3B|128K|WINT8|1|export XPU_VISIBLE_DEVICES="0" # pick any single card<br>python -m fastdeploy.entrypoints.openai.api_server \ <br> --model PaddlePaddle/ERNIE-4.5-21B-A3B-Paddle \ <br> --port 8188 \ <br> --tensor-parallel-size 1 \ <br> --max-model-len 131072 \ <br> --max-num-seqs 128 \ <br> --quantization "wint8" \ <br> --gpu-memory-utilization 0.9|>=2.1.0|
|ERNIE-4.5-21B-A3B|128K|WINT4|1|export XPU_VISIBLE_DEVICES="0" # pick any single card<br>python -m fastdeploy.entrypoints.openai.api_server \ <br> --model PaddlePaddle/ERNIE-4.5-21B-A3B-Paddle \ <br> --port 8188 \ <br> --tensor-parallel-size 1 \ <br> --max-model-len 131072 \ <br> --max-num-seqs 128 \ <br> --quantization "wint4" \ <br> --gpu-memory-utilization 0.9|>=2.1.0|
|ERNIE-4.5-0.3B|32K|BF16|1|export XPU_VISIBLE_DEVICES="0" # pick any single card<br>python -m fastdeploy.entrypoints.openai.api_server \ <br> --model PaddlePaddle/ERNIE-4.5-0.3B-Paddle \ <br> --port 8188 \ <br> --tensor-parallel-size 1 \ <br> --max-model-len 32768 \ <br> --max-num-seqs 128 \ <br> --gpu-memory-utilization 0.9|>=2.0.3|
|ERNIE-4.5-0.3B|32K|WINT8|1|export XPU_VISIBLE_DEVICES="x" # pick any single card<br>python -m fastdeploy.entrypoints.openai.api_server \ <br> --model PaddlePaddle/ERNIE-4.5-0.3B-Paddle \ <br> --port 8188 \ <br> --tensor-parallel-size 1 \ <br> --max-model-len 32768 \ <br> --max-num-seqs 128 \ <br> --quantization "wint8" \ <br> --gpu-memory-utilization 0.9|>=2.0.3|
|ERNIE-4.5-0.3B|128K|BF16|1|export XPU_VISIBLE_DEVICES="0" # pick any single card<br>python -m fastdeploy.entrypoints.openai.api_server \ <br> --model PaddlePaddle/ERNIE-4.5-0.3B-Paddle \ <br> --port 8188 \ <br> --tensor-parallel-size 1 \ <br> --max-model-len 131072 \ <br> --max-num-seqs 128 \ <br> --gpu-memory-utilization 0.9|>=2.0.3|
@@ -83,4 +89,4 @@ for chunk in response:
    print('\n')
```

For more details on the OpenAI protocol, see the [OpenAI Chat Completion API](https://platform.openai.com/docs/api-reference/chat/create) documentation; the differences from the OpenAI protocol are described in [OpenAI-protocol-compatible server deployment](../../online_serving/README.md).
For more details on the OpenAI protocol, see the [OpenAI Chat Completion API](https://platform.openai.com/docs/api-reference/chat/create) documentation; the differences from the OpenAI protocol are described in [OpenAI-protocol-compatible server deployment](../online_serving/README.md).
@@ -24,7 +24,11 @@ os.environ["GLOG_minloglevel"] = "2"
os.environ["AISTUDIO_LOG"] = "critical"
from fastdeploy.engine.sampling_params import SamplingParams
from fastdeploy.entrypoints.llm import LLM
from fastdeploy.utils import version
from fastdeploy.utils import version, envs
from paddleformers.utils.log import logger as pf_logger
if envs.FD_DEBUG != "1":
    import logging
    pf_logger.logger.setLevel(logging.INFO)

__all__ = ["LLM", "SamplingParams", "version"]
|
@@ -142,7 +142,7 @@ class CacheMessager:
|
||||
|
||||
self.gpu_id = gpu_id
|
||||
self.cache_info = dict()
|
||||
self.dp_rank_id = local_data_parallel_id
|
||||
self.dp_rank_id = self.rank + local_data_parallel_id * self.nranks
|
||||
|
||||
layerwise_send_cache_thread = threading.Thread(target=self._prefill_layerwise_send_cache_thread)
|
||||
layerwise_send_cache_thread.daemon = True
|
||||
|
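A small, purely hypothetical illustration of what the revised `dp_rank_id` computation above yields (the surrounding class is not reproduced here):

```python
# With nranks = 2 tensor-parallel ranks per data-parallel group:
nranks = 2
for local_data_parallel_id in range(2):
    for rank in range(nranks):
        dp_rank_id = rank + local_data_parallel_id * nranks
        print(local_data_parallel_id, rank, dp_rank_id)
# dp group 0 -> ids 0 and 1; dp group 1 -> ids 2 and 3, so every rank gets a distinct id.
```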
@@ -31,6 +31,7 @@ from fastdeploy import envs
|
||||
from fastdeploy.cache_manager.cache_data import BlockNode, CacheStatus
|
||||
from fastdeploy.cache_manager.cache_metrics import CacheMetrics
|
||||
from fastdeploy.inter_communicator import EngineCacheQueue, IPCSignal
|
||||
from fastdeploy.metrics.metrics import main_process_metrics
|
||||
from fastdeploy.utils import get_logger
|
||||
|
||||
logger = get_logger("prefix_cache_manager", "prefix_cache_manager.log")
|
||||
@@ -64,7 +65,10 @@ class PrefixCacheManager:
|
||||
self.speculative_config = config.speculative_config
|
||||
self.local_data_parallel_id = local_data_parallel_id
|
||||
|
||||
self.num_gpu_blocks = self.cache_config.prefill_kvcache_block_num
|
||||
if envs.ENABLE_V1_KVCACHE_SCHEDULER:
|
||||
self.num_gpu_blocks = self.cache_config.total_block_num
|
||||
else:
|
||||
self.num_gpu_blocks = self.cache_config.prefill_kvcache_block_num
|
||||
self.num_cpu_blocks = self.cache_config.num_cpu_blocks
|
||||
self.gpu_free_block_list = list(range(self.num_gpu_blocks - 1, -1, -1))
|
||||
if self.num_cpu_blocks > 0:
|
||||
@@ -106,6 +110,10 @@ class PrefixCacheManager:
|
||||
+ f"{self.num_cpu_blocks}, bytes_per_layer_per_block {self.cache_config.bytes_per_layer_per_block}"
|
||||
)
|
||||
|
||||
@property
|
||||
def available_gpu_resource(self):
|
||||
return len(self.gpu_free_block_list) / self.num_gpu_blocks if self.num_gpu_blocks > 0 else 0.0
|
||||
|
||||
def launch_cache_manager(
|
||||
self,
|
||||
cache_config,
|
||||
@@ -150,7 +158,7 @@ class PrefixCacheManager:
|
||||
kv_num_head = int(cache_config.model_cfg.num_key_value_heads) // tensor_parallel_size
|
||||
else:
|
||||
kv_num_head = cache_config.model_cfg.num_attention_heads // tensor_parallel_size
|
||||
|
||||
kv_num_head = max(1, kv_num_head)
|
||||
cache_ready_signal_data = np.zeros(shape=[tensor_parallel_size], dtype=np.int32)
|
||||
self.cache_ready_signal = IPCSignal(
|
||||
name="cache_ready_signal",
|
||||
@@ -225,6 +233,9 @@ class PrefixCacheManager:
|
||||
heapq.heapify(self.gpu_free_block_list)
|
||||
self.node_id_pool = list(range(self.num_gpu_blocks + self.num_cpu_blocks))
|
||||
|
||||
main_process_metrics.max_gpu_block_num.set(self.num_gpu_blocks)
|
||||
main_process_metrics.available_gpu_resource.set(1.0)
|
||||
|
||||
def _enable_cpu_cache(self):
|
||||
"""
|
||||
_enable_cpu_cache function used to enable cpu cache.
|
||||
@@ -260,6 +271,8 @@ class PrefixCacheManager:
|
||||
logger.info(
|
||||
f"allocate_gpu_blocks: {allocated_block_ids}, len(self.gpu_free_block_list) {len(self.gpu_free_block_list)}"
|
||||
)
|
||||
main_process_metrics.free_gpu_block_num.set(len(self.gpu_free_block_list))
|
||||
main_process_metrics.available_gpu_resource.set(self.available_gpu_resource)
|
||||
return allocated_block_ids
|
||||
|
||||
def recycle_gpu_blocks(self, gpu_block_ids):
|
||||
@@ -274,6 +287,8 @@ class PrefixCacheManager:
|
||||
heapq.heappush(self.gpu_free_block_list, gpu_block_id)
|
||||
else:
|
||||
heapq.heappush(self.gpu_free_block_list, gpu_block_ids)
|
||||
main_process_metrics.free_gpu_block_num.set(len(self.gpu_free_block_list))
|
||||
main_process_metrics.available_gpu_resource.set(self.available_gpu_resource)
|
||||
|
||||
def allocate_cpu_blocks(self, num_blocks):
|
||||
"""
|
||||
@@ -494,7 +509,7 @@ class PrefixCacheManager:
|
||||
self.metrics.req_count += 1
|
||||
input_ids = task.prompt_token_ids
|
||||
req_id = task.request_id
|
||||
logger.info(f"request_block_ids: start to allocate blocks for req_id {req_id}")
|
||||
logger.info(f"request_match_blocks: start to allocate blocks for req_id {req_id}")
|
||||
input_token_num = len(input_ids)
|
||||
common_block_ids = []
|
||||
# 1. match block
|
||||
@@ -526,7 +541,7 @@ class PrefixCacheManager:
|
||||
cpu_recv_block_ids=[],
|
||||
)
|
||||
else:
|
||||
raise Exception("Not enough GPU memory to allocate cache for matched CPU Cache")
|
||||
raise Exception("request_match_blocks: Not enough GPU memory to allocate cache for matched CPU Cache")
|
||||
|
||||
# record request cache info
|
||||
self.cache_info[req_id] = (match_block_node, input_ids)
|
||||
@@ -548,11 +563,14 @@ class PrefixCacheManager:
|
||||
if self.metrics.req_count % 10000 == 0:
|
||||
self.metrics.reset_metrics()
|
||||
logger.info(
|
||||
f"request_block_ids: request block for req_id {req_id}: common_block_ids {common_block_ids}"
|
||||
f"request_match_blocks: request block for req_id {req_id}: common_block_ids {common_block_ids}"
|
||||
)
|
||||
# set leaf node temporarily, then update it in update_cache_blocks
|
||||
self.req_leaf_map[req_id] = match_block_node
|
||||
self.leaf_req_map[match_block_node].add(req_id)
|
||||
return common_block_ids, matched_token_num, hit_info
|
||||
except Exception as e:
|
||||
logger.error(f"request_block_ids: error: {type(e)} {e}")
|
||||
logger.error(f"request_match_blocks: request_block_ids: error: {type(e)} {e}")
|
||||
raise e
|
||||
|
||||
def request_block_ids(self, task, block_size, dec_token_num, *args):
|
||||
@@ -708,6 +726,43 @@ class PrefixCacheManager:
|
||||
except Exception as e:
|
||||
logger.error(f"release_block_ids: error: {type(e)} {e}")
|
||||
raise e
|
||||
def free_nodes_directly(self, node):
|
||||
"""
|
||||
Recycle nodes by a query directly.
|
||||
"""
|
||||
with self.request_release_lock:
|
||||
try:
|
||||
total_gpu_free_count = 0
|
||||
while True:
|
||||
if node in self.gpu_lru_leaf_heap:
|
||||
self.gpu_lru_leaf_heap.remove(node)
|
||||
self.gpu_lru_leaf_set.remove(node)
|
||||
if node.shared_count == 0 and node.is_gpu_leaf_node: # 直接回收
|
||||
self._handle_free_gpu_node_without_cpu(node)
|
||||
logger.info(f"free_nodes_directly: node {node}")
|
||||
total_gpu_free_count += 1
|
||||
cur_node = node
|
||||
node = node.parent
|
||||
if cur_node.hash_value in node.children:
|
||||
del node.children[cur_node.hash_value]
|
||||
if not node.children:
|
||||
if node in self.gpu_lru_leaf_set:
|
||||
continue
|
||||
if (
|
||||
node != self.radix_tree_root
|
||||
and node.shared_count == 0
|
||||
and node.is_gpu_leaf_node
|
||||
and node.is_persistent is False
|
||||
):
|
||||
heapq.heappush(self.gpu_lru_leaf_heap, node)
|
||||
self.gpu_lru_leaf_set.add(node)
|
||||
else:
|
||||
break
|
||||
else:
|
||||
break
|
||||
except Exception as e:
|
||||
logger.error(f"free_nodes_directly: error: {type(e)} {e}")
|
||||
raise e
|
||||
|
||||
def _handle_free_gpu_node_without_cpu(self, node):
|
||||
"""
|
||||
@@ -1051,6 +1106,15 @@ class PrefixCacheManager:
|
||||
node.last_used_time = current_time
|
||||
node.req_id_set.add(req_id)
|
||||
node = node.parent
|
||||
|
||||
def decrease_request_share_count(self, req_id):
|
||||
"""
|
||||
Decrease node shared count
|
||||
"""
|
||||
node, input_ids = self.cache_info[req_id]
|
||||
while node != self.radix_tree_root:
|
||||
node.decrement_shared_count()
|
||||
node = node.parent
|
||||
|
||||
def build_path(
|
||||
self,
|
||||
|
@@ -118,6 +118,7 @@ class ModelConfig:
self.enable_redundant_experts = False
self.redundant_experts_num = 0
self.quantization = None
self.think_end_id = None
for key, value in args.items():
    if hasattr(self, key):
        setattr(self, key, value)
@@ -726,7 +727,10 @@ class CacheConfig:
self.block_size = 64
self.gpu_memory_utilization = 0.9
self.num_gpu_blocks_override = None
self.kv_cache_ratio = 0.75
if envs.ENABLE_V1_KVCACHE_SCHEDULER:
    self.kv_cache_ratio = 1.0
else:
    self.kv_cache_ratio = 0.75
self.enc_dec_block_num = 2
self.prealloc_dec_block_slot_num_threshold = 5
self.cache_dtype = "bfloat16"
@@ -811,7 +815,10 @@ class CacheConfig:
self.dec_token_num = self.enc_dec_block_num * self.block_size
if self.num_gpu_blocks_override is not None:
    self.total_block_num = self.num_gpu_blocks_override
    self.prefill_kvcache_block_num = int(self.total_block_num * self.kv_cache_ratio)
    if envs.ENABLE_V1_KVCACHE_SCHEDULER:
        self.prefill_kvcache_block_num = self.total_block_num
    else:
        self.prefill_kvcache_block_num = int(self.total_block_num * self.kv_cache_ratio)
else:
    length = num_total_tokens // number_of_tasks
    block_num = (length + self.block_size - 1 + self.dec_token_num) // self.block_size
@@ -824,7 +831,10 @@ class CacheConfig:
reset gpu block number
"""
self.total_block_num = num_gpu_blocks
self.prefill_kvcache_block_num = int(self.total_block_num * self.kv_cache_ratio)
if envs.ENABLE_V1_KVCACHE_SCHEDULER:
    self.prefill_kvcache_block_num = self.total_block_num
else:
    self.prefill_kvcache_block_num = int(self.total_block_num * self.kv_cache_ratio)
logger.info(
    f"Reset block num, the total_block_num:{self.total_block_num},"
    f" prefill_kvcache_block_num:{self.prefill_kvcache_block_num}"
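To make the effect of the branch above concrete, here is a hypothetical calculation with invented numbers:

```python
# Suppose 1000 GPU cache blocks are available in total.
total_block_num = 1000
kv_cache_ratio = 0.75

# Legacy scheduler: only a fraction of the blocks is reserved for prefill.
prefill_kvcache_block_num_legacy = int(total_block_num * kv_cache_ratio)  # 750

# With ENABLE_V1_KVCACHE_SCHEDULER set: the whole pool is used for prefill.
prefill_kvcache_block_num_v1 = total_block_num  # 1000
```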
@@ -15,10 +15,13 @@
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
from dataclasses import asdict, dataclass
|
||||
from dataclasses import fields as dataclass_fields
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
import paddle
|
||||
|
||||
from fastdeploy.config import (
|
||||
CacheConfig,
|
||||
EarlyStopConfig,
|
||||
@@ -92,6 +95,14 @@ class EngineArgs:
|
||||
"""
|
||||
specifies the reasoning parser to use for extracting reasoning content from the model output
|
||||
"""
|
||||
tool_call_parser: str = None
|
||||
"""
|
||||
specifies the tool call parser to use for extracting tool call from the model output
|
||||
"""
|
||||
tool_parser_plugin: str = None
|
||||
"""
|
||||
tool parser plugin used to register user defined tool parsers
|
||||
"""
|
||||
enable_mm: bool = False
|
||||
"""
|
||||
Flags to enable multi-modal model
|
||||
@@ -420,6 +431,18 @@ class EngineArgs:
|
||||
help="Flag specifies the reasoning parser to use for extracting "
|
||||
"reasoning content from the model output",
|
||||
)
|
||||
model_group.add_argument(
|
||||
"--tool-call-parser",
|
||||
type=str,
|
||||
default=EngineArgs.tool_call_parser,
|
||||
help="Flag specifies the tool call parser to use for extracting" "tool call from the model output",
|
||||
)
|
||||
model_group.add_argument(
|
||||
"--tool-parser-plugin",
|
||||
type=str,
|
||||
default=EngineArgs.tool_parser_plugin,
|
||||
help="tool parser plugin used to register user defined tool parsers",
|
||||
)
|
||||
model_group.add_argument(
|
||||
"--speculative-config",
|
||||
type=json.loads,
|
||||
@@ -865,7 +888,13 @@ class EngineArgs:
|
||||
if self.enable_chunked_prefill:
|
||||
self.max_num_batched_tokens = 2048
|
||||
else:
|
||||
self.max_num_batched_tokens = self.max_model_len
|
||||
if not int(os.getenv("ENABLE_V1_KVCACHE_SCHEDULER", "0")):
|
||||
self.max_num_batched_tokens = self.max_model_len
|
||||
else:
|
||||
if paddle.is_compiled_with_xpu():
|
||||
self.max_num_batched_tokens = self.max_model_len
|
||||
else:
|
||||
self.max_num_batched_tokens = 8192
|
||||
|
||||
all_dict = asdict(self)
|
||||
all_dict["model_cfg"] = model_cfg
|
||||
@@ -904,6 +933,7 @@ class EngineArgs:
|
||||
mm_processor_kwargs=self.mm_processor_kwargs,
|
||||
enable_mm=self.enable_mm,
|
||||
reasoning_parser=self.reasoning_parser,
|
||||
tool_parser=self.tool_call_parser,
|
||||
splitwise_role=self.splitwise_role,
|
||||
innode_prefill_ports=self.innode_prefill_ports,
|
||||
max_num_partial_prefills=self.max_num_partial_prefills,
|
||||
|
@@ -85,6 +85,7 @@ class Config:
max_long_partial_prefills: int = 1,
long_prefill_token_threshold: int = 0,
reasoning_parser: str = None,
tool_parser: str = None,
guided_decoding_backend: Optional[str] = None,
disable_any_whitespace: bool = False,
enable_logprob: bool = False,
@@ -165,6 +166,7 @@ class Config:
self.max_long_partial_prefills = max_long_partial_prefills
self.long_prefill_token_threshold = long_prefill_token_threshold
self.reasoning_parser = reasoning_parser
self.tool_parser = tool_parser
self.graph_optimization_config = graph_optimization_config
self.early_stop_config = early_stop_config
self.guided_decoding_backend = guided_decoding_backend
@@ -236,7 +238,13 @@ class Config:
if self.cache_config.enable_chunked_prefill:
    self.max_num_batched_tokens = 2048
else:
    self.max_num_batched_tokens = self.max_model_len
    if not int(os.getenv("ENABLE_V1_KVCACHE_SCHEDULER", "0")):
        self.max_num_batched_tokens = self.max_model_len
    else:
        if paddle.is_compiled_with_xpu():
            self.max_num_batched_tokens = self.max_model_len
        else:
            self.max_num_batched_tokens = 8192

if self.long_prefill_token_threshold == 0:
    self.long_prefill_token_threshold = int(self.max_model_len * 0.04)
@@ -284,10 +292,11 @@ class Config:
)

if not self.cache_config.enable_chunked_prefill:
    assert self.max_num_batched_tokens >= self.max_model_len, (
        f"max_num_batched_tokens: {self.max_num_batched_tokens} "
        f"should be larger than or equal to max_model_len: {self.max_model_len}"
    )
    if not int(os.getenv("ENABLE_V1_KVCACHE_SCHEDULER", "0")):
        assert self.max_num_batched_tokens >= self.max_model_len, (
            f"max_num_batched_tokens: {self.max_num_batched_tokens} "
            f"should be larger than or equal to max_model_len: {self.max_model_len}"
        )
else:
    assert self.max_num_batched_tokens >= self.cache_config.block_size, (
        f"max_num_batched_tokens: {self.max_num_batched_tokens} "
@@ -47,12 +47,14 @@ from fastdeploy.inter_communicator import (
|
||||
EngineCacheQueue,
|
||||
EngineWorkerQueue,
|
||||
IPCSignal,
|
||||
ZmqClient,
|
||||
ZmqIpcServer,
|
||||
ZmqTcpServer,
|
||||
)
|
||||
from fastdeploy.metrics.metrics import main_process_metrics
|
||||
from fastdeploy.metrics.trace_util import start_span, start_span_request
|
||||
from fastdeploy.model_executor.guided_decoding import schema_checker
|
||||
from fastdeploy.output.token_processor import TokenProcessor, WarmUpTokenProcessor
|
||||
from fastdeploy.splitwise.internal_adapter_utils import InternalAdapter
|
||||
from fastdeploy.splitwise.splitwise_connector import SplitwiseConnector
|
||||
from fastdeploy.utils import EngineError, console_logger, envs, llm_logger
|
||||
|
||||
@@ -106,6 +108,7 @@ class LLMEngine:
|
||||
cfg.limit_mm_per_prompt,
|
||||
cfg.mm_processor_kwargs,
|
||||
cfg.enable_mm,
|
||||
cfg.tool_parser,
|
||||
)
|
||||
|
||||
self.start_queue_service()
|
||||
@@ -123,8 +126,9 @@ class LLMEngine:
|
||||
cfg.max_num_seqs, cfg, cfg.tensor_parallel_size, cfg.splitwise_role
|
||||
)
|
||||
|
||||
os.environ["INFERENCE_MSG_QUEUE_ID"] = str(self.cfg.engine_worker_queue_port)
|
||||
|
||||
os.environ["INFERENCE_MSG_QUEUE_ID"] = str(
|
||||
self.cfg.engine_worker_queue_port + self.cfg.worker_num_per_node * self.cfg.node_rank
|
||||
)
|
||||
self.split_connector = SplitwiseConnector(cfg, self.scheduler, self.engine_worker_queue, self.resource_manager)
|
||||
|
||||
self.token_processor = TokenProcessor(
|
||||
@@ -179,9 +183,19 @@ class LLMEngine:
|
||||
self.data_processor = self.input_processor.create_processor()
|
||||
|
||||
if api_server_pid is not None:
|
||||
self.zmq_server = ZmqClient(name=api_server_pid, mode=zmq.PULL)
|
||||
self.zmq_server.start_server()
|
||||
self.zmq_server.create_router()
|
||||
if envs.FD_ENABLE_INTERNAL_ADAPTER:
|
||||
self.recv_request_server = ZmqTcpServer(port=envs.FD_ZMQ_RECV_REQUEST_SERVER_PORT, mode=zmq.PULL)
|
||||
self.send_response_server = ZmqTcpServer(port=envs.FD_ZMQ_SEND_RESPONSE_SERVER_PORT, mode=zmq.ROUTER)
|
||||
self.external_adapter = InternalAdapter(
|
||||
cfg=self.cfg, engine=self, dp_rank=self.cfg.node_rank * self.cfg.worker_num_per_node
|
||||
)
|
||||
else:
|
||||
self.recv_request_server = ZmqIpcServer(name=api_server_pid, mode=zmq.PULL)
|
||||
self.send_response_server = ZmqIpcServer(name=api_server_pid, mode=zmq.ROUTER)
|
||||
self.recv_result_handle_thread = threading.Thread(
|
||||
target=self.send_response_server.recv_result_handle, daemon=True
|
||||
)
|
||||
self.recv_result_handle_thread.start()
|
||||
time.sleep(3)
|
||||
|
||||
if self.do_profile == 0 and (
|
||||
@@ -196,13 +210,42 @@ class LLMEngine:
|
||||
engine_worker_queue_port=self.cfg.engine_worker_queue_port,
|
||||
pid_suffix=self.ipc_signal_suffix,
|
||||
)
|
||||
self.launched_cache_manager_signal.value[0] = 1
|
||||
|
||||
self.worker_proc = self._start_worker_service()
|
||||
console_logger.info("Waitting worker processes ready...")
|
||||
console_logger.info("Waiting for worker processes to be ready...")
|
||||
time.sleep(5)
|
||||
self.worker_init_status = dict()
|
||||
if not self.check_worker_initialize_status():
|
||||
|
||||
result_container = {}
|
||||
|
||||
def check_worker_initialize_status_func(res: dict):
|
||||
res["worker_is_alive"] = True
|
||||
if not self.check_worker_initialize_status():
|
||||
console_logger.error("Failed to launch worker processes, check log/workerlog.* for more details.")
|
||||
res["worker_is_alive"] = False
|
||||
|
||||
self.check_worker_initialize_status_func_thread = threading.Thread(
|
||||
target=check_worker_initialize_status_func, args=(result_container,), daemon=True
|
||||
)
|
||||
self.check_worker_initialize_status_func_thread.start()
|
||||
|
||||
# Wait model loading
|
||||
while self.loaded_model_signal.value[0] == 0:
|
||||
# Make sure worker process is alive
|
||||
if not self.check_worker_initialize_status_func_thread.is_alive():
|
||||
return False
|
||||
time.sleep(1)
|
||||
|
||||
if self.do_profile:
|
||||
self._stop_profile()
|
||||
# Launch components: scheduler, cache_manager, expert_service et.al.
|
||||
self.launch_components()
|
||||
if self.cfg.cache_config.enable_prefix_caching or self.cfg.splitwise_role != "mixed":
|
||||
self.launched_cache_manager_signal.value[0] = 1
|
||||
|
||||
# Worker launched
|
||||
self.check_worker_initialize_status_func_thread.join()
|
||||
if not result_container["worker_is_alive"]:
|
||||
console_logger.error("Failed to launch worker processes, check log/workerlog.* for more details.")
|
||||
return False
|
||||
|
||||
@@ -214,68 +257,6 @@ class LLMEngine:
|
||||
self._del_warmup_token_processor()
|
||||
console_logger.info("Warmup finished")
|
||||
|
||||
self.token_processor.tasks_queue = self.engine_worker_queue
|
||||
|
||||
if envs.ENABLE_V1_KVCACHE_SCHEDULER:
|
||||
self.insert_task_to_worker_thread = threading.Thread(target=self._scheduler_task_to_worker_v1, daemon=True)
|
||||
else:
|
||||
self.insert_task_to_worker_thread = threading.Thread(target=self._insert_task_to_worker, daemon=True)
|
||||
self.insert_task_to_worker_thread.start()
|
||||
|
||||
if self.api_server_pid is not None:
|
||||
self.insert_task_to_scheduler_thread = threading.Thread(
|
||||
target=self._insert_zmq_task_to_scheduler, daemon=True
|
||||
)
|
||||
self.insert_task_to_scheduler_thread.start()
|
||||
|
||||
self.receive_output_thread = threading.Thread(target=self._zmq_send_generated_tokens, daemon=True)
|
||||
self.receive_output_thread.start()
|
||||
|
||||
# Start TokenProcessor thread
|
||||
self.token_processor.run()
|
||||
|
||||
if self.cfg.splitwise_role != "mixed":
|
||||
# 单机逻辑
|
||||
self.engine_worker_queue.available_prefill_instances.put(1)
|
||||
self.split_mode_get_tasks()
|
||||
if self.cfg.scheduler_config.name == "splitwise":
|
||||
self.splitwise_receive_thread = threading.Thread(target=self.split_connector.start_receiver, args=())
|
||||
self.splitwise_receive_thread.daemon = True
|
||||
self.splitwise_receive_thread.start()
|
||||
|
||||
self.cfg.init_cache_info()
|
||||
|
||||
role = self.cfg.splitwise_role
|
||||
host_ip = self.cfg.host_ip
|
||||
disaggregate = self.cfg.disaggregate_info
|
||||
if self.cfg.scheduler_config.name == "splitwise":
|
||||
self.scheduler.start(role, host_ip, disaggregate)
|
||||
|
||||
time.sleep(1)
|
||||
|
||||
if self.cfg.parallel_config.enable_expert_parallel and self.cfg.parallel_config.data_parallel_size > 1:
|
||||
self.dp_processed = []
|
||||
for i in range(
|
||||
1,
|
||||
self.cfg.parallel_config.data_parallel_size // self.cfg.nnode,
|
||||
):
|
||||
time.sleep(1)
|
||||
self.dp_processed.append(
|
||||
multiprocessing.Process(
|
||||
target=start_expert_service,
|
||||
args=(
|
||||
self.cfg,
|
||||
i + self.cfg.node_rank * self.cfg.worker_num_per_node,
|
||||
self.ipc_signal_suffix,
|
||||
),
|
||||
)
|
||||
)
|
||||
llm_logger.info(
|
||||
f"Engine is initialized successfully with {self.cfg.tensor_parallel_size}"
|
||||
+ f" data parallel id {i}"
|
||||
)
|
||||
self.dp_processed[-1].start()
|
||||
|
||||
console_logger.info(f"Worker processes are launched with {time.time() - start_time} seconds.")
|
||||
return True
|
||||
|
||||
@@ -291,7 +272,7 @@ class LLMEngine:
|
||||
time.sleep(0.005)
|
||||
continue
|
||||
for request_id, contents in results.items():
|
||||
self.zmq_server.send_multipart(request_id, contents)
|
||||
self.send_response_server.send_response(request_id, contents)
|
||||
|
||||
except Exception as e:
|
||||
llm_logger.error(f"Unexcepted error happend: {e}, {traceback.format_exc()!s}")
|
||||
@@ -400,6 +381,8 @@ class LLMEngine:
|
||||
get_request_pool.submit(_fetch_request)
|
||||
# 2. Schedule requests
|
||||
tasks = self.resource_manager.schedule()
|
||||
main_process_metrics.num_requests_waiting.dec(len(tasks))
|
||||
main_process_metrics.num_requests_running.inc(len(tasks))
|
||||
# 3. Send to engine
|
||||
if tasks:
|
||||
self.resource_manager.get_real_bsz()
|
||||
@@ -420,9 +403,9 @@ class LLMEngine:
|
||||
try:
|
||||
block = True if len(added_requests) == 0 else False
|
||||
if not self.cfg.enable_mm:
|
||||
err, data = self.zmq_server.receive_json_once(block)
|
||||
err, data = self.recv_request_server.receive_json_once(block)
|
||||
else:
|
||||
err, data = self.zmq_server.receive_pyobj_once(block)
|
||||
err, data = self.recv_request_server.receive_pyobj_once(block)
|
||||
if err is not None:
|
||||
llm_logger.error("Engine stops inserting zmq task into scheduler, err:{err}")
|
||||
break
|
||||
@@ -430,19 +413,24 @@ class LLMEngine:
|
||||
request, insert_task = None, []
|
||||
results: List[Tuple[str, Optional[str]]] = list()
|
||||
if data:
|
||||
request = Request.from_dict(data)
|
||||
start_span("ENQUEUE_ZMQ", data, trace.SpanKind.PRODUCER)
|
||||
|
||||
llm_logger.debug(f"Receive request: {request}")
|
||||
|
||||
err_msg = None
|
||||
if self.guided_decoding_checker is not None:
|
||||
request, err_msg = self.guided_decoding_checker.schema_format(request)
|
||||
try:
|
||||
request = Request.from_dict(data)
|
||||
start_span("ENQUEUE_ZMQ", data, trace.SpanKind.PRODUCER)
|
||||
main_process_metrics.requests_number.inc()
|
||||
llm_logger.debug(f"Receive request: {request}")
|
||||
except Exception as e:
|
||||
llm_logger.error(f"Receive request error: {e}, {traceback.format_exc()!s}")
|
||||
err_msg = str(e)
|
||||
results.append((data["request_id"], err_msg))
|
||||
|
||||
if err_msg is not None:
|
||||
llm_logger.error(err_msg)
|
||||
results.append((request.request_id, err_msg))
|
||||
else:
|
||||
if self.guided_decoding_checker is not None and err_msg is None:
|
||||
request, err_msg = self.guided_decoding_checker.schema_format(request)
|
||||
if err_msg is not None:
|
||||
llm_logger.error(f"Receive request error: {err_msg}")
|
||||
results.append((request.request_id, err_msg))
|
||||
|
||||
if err_msg is None:
|
||||
insert_task.append(request)
|
||||
|
||||
response = self.scheduler.put_requests(insert_task)
|
||||
@@ -454,9 +442,10 @@ class LLMEngine:
|
||||
added_requests[request.request_id] += 1
|
||||
|
||||
for request_id, failed in results:
|
||||
added_requests[request_id] -= 1
|
||||
if added_requests[request_id] == 0:
|
||||
added_requests.pop(request_id)
|
||||
if request_id in added_requests:
|
||||
added_requests[request_id] -= 1
|
||||
if added_requests[request_id] == 0:
|
||||
added_requests.pop(request_id)
|
||||
|
||||
if failed is None:
|
||||
main_process_metrics.num_requests_waiting.inc(1)
|
||||
@@ -470,7 +459,7 @@ class LLMEngine:
|
||||
)
|
||||
# Since the request is not in scheduler
|
||||
# Send result by zmq directly
|
||||
self.zmq_server.send_multipart(request_id, error_result)
|
||||
self.send_response_server.send_response(request_id, [error_result])
|
||||
except Exception as e:
|
||||
llm_logger.error(
|
||||
f"Error happend while receving new request from zmq, details={e}, "
|
||||
@@ -497,10 +486,7 @@ class LLMEngine:
|
||||
request.sampling_params = sampling_params
|
||||
request.preprocess_start_time = time.time()
|
||||
|
||||
enable_thinking = None
|
||||
if kwargs is not None:
|
||||
enable_thinking = kwargs.get("enable_thinking", None)
|
||||
request = self.data_processor.process_request(request, self.cfg.max_model_len, enable_thinking=enable_thinking)
|
||||
request = self.data_processor.process_request(request, self.cfg.max_model_len, **kwargs)
|
||||
request.prompt_token_ids_len = len(request.prompt_token_ids)
|
||||
request.need_prefill_tokens = request.prompt_token_ids_len
|
||||
input_ids_len = request.prompt_token_ids_len
|
||||
@@ -530,6 +516,26 @@ class LLMEngine:
|
||||
llm_logger.error(error_msg)
|
||||
raise EngineError(error_msg, error_code=400)
|
||||
|
||||
if request.get("stop_seqs_len") is not None:
|
||||
stop_seqs_len = request.get("stop_seqs_len")
|
||||
max_stop_seqs_num = int(envs.FD_MAX_STOP_SEQS_NUM)
|
||||
if len(stop_seqs_len) > max_stop_seqs_num:
|
||||
error_msg = (
|
||||
f"Length of stop ({stop_seqs_len}) exceeds the limit max_stop_seqs_num({max_stop_seqs_num})."
|
||||
"Please reduce the number of stop or set a lager max_stop_seqs_num by `FD_MAX_STOP_SEQS_NUM`"
|
||||
)
|
||||
llm_logger.error(error_msg)
|
||||
raise EngineError(error_msg, error_code=400)
|
||||
stop_seqs_max_len = int(envs.FD_STOP_SEQS_MAX_LEN)
|
||||
for single_stop_seq_len in stop_seqs_len:
|
||||
if single_stop_seq_len > stop_seqs_max_len:
|
||||
error_msg = (
|
||||
f"Length of stop_seqs({single_stop_seq_len}) exceeds the limit stop_seqs_max_len({stop_seqs_max_len})."
|
||||
"Please reduce the length of stop sequences or set a larger stop_seqs_max_len by `FD_STOP_SEQS_MAX_LEN`"
|
||||
)
|
||||
llm_logger.error(error_msg)
|
||||
raise EngineError(error_msg, error_code=400)
|
||||
|
||||
if self.guided_decoding_checker is not None:
|
||||
request, err_msg = self.guided_decoding_checker.schema_format(request)
|
||||
if err_msg is not None:
|
||||
@@ -749,10 +755,6 @@ class LLMEngine:
|
||||
"""
|
||||
Insert tasks to engine.
|
||||
"""
|
||||
for task in tasks:
|
||||
start_span_request("DEQUEUE", task, trace.SpanKind.CONSUMER)
|
||||
if task.sampling_params.bad_words is not None:
|
||||
task.sampling_params.update_from_tokenizer(self.data_processor.tokenizer)
|
||||
# TODO 返回至 scheduler
|
||||
if allocated:
|
||||
current_tasks = []
|
||||
@@ -779,6 +781,11 @@ class LLMEngine:
|
||||
self.engine_worker_queue.put_tasks((current_tasks, self.resource_manager.real_bsz))
|
||||
return True
|
||||
|
||||
for task in tasks:
|
||||
start_span_request("DEQUEUE", task, trace.SpanKind.CONSUMER)
|
||||
if task.sampling_params.bad_words is not None:
|
||||
task.sampling_params.update_from_tokenizer(self.data_processor.tokenizer)
|
||||
|
||||
self.resource_manager.check_and_free_block_tables()
|
||||
|
||||
if not isinstance(tasks, list):
|
||||
@@ -918,7 +925,6 @@ class LLMEngine:
|
||||
suffix=self.ipc_signal_suffix,
|
||||
create=True,
|
||||
)
|
||||
|
||||
# launched_cache_manager_signal 用于感知engine是否启动了cache_manager
|
||||
if self.cfg.cache_config.enable_prefix_caching or self.cfg.splitwise_role != "mixed":
|
||||
launched_cache_manager_signal_data = np.zeros([1], dtype=np.int32)
|
||||
@@ -930,6 +936,29 @@ class LLMEngine:
|
||||
create=True,
|
||||
)
|
||||
|
||||
# launched_expert_service_signal: Used to sense whether each expet_servic is started successfully
|
||||
if self.cfg.parallel_config.enable_expert_parallel and self.cfg.parallel_config.data_parallel_size > 1:
|
||||
launched_expert_service_signal_data = np.zeros(
|
||||
shape=[self.cfg.parallel_config.data_parallel_size // self.cfg.nnode], dtype=np.int32
|
||||
)
|
||||
self.launched_expert_service_signal = IPCSignal(
|
||||
name="launched_expert_service_signal",
|
||||
array=launched_expert_service_signal_data,
|
||||
dtype=np.int32,
|
||||
suffix=self.ipc_signal_suffix,
|
||||
create=True,
|
||||
)
|
||||
|
||||
# loaded_model_signal: Used to detect whether each worker has completed model loading
|
||||
loaded_model_signal_data = np.zeros([1], dtype=np.int32)
|
||||
self.loaded_model_signal = IPCSignal(
|
||||
name="loaded_model_signal",
|
||||
array=loaded_model_signal_data,
|
||||
dtype=np.int32,
|
||||
suffix=self.ipc_signal_suffix,
|
||||
create=True,
|
||||
)
|
||||
|
||||
# worker_live_signal 用于engine感知各worker进程是否存活,记录每个step 时间
|
||||
worker_healthy_live_recorded_time_array = np.zeros(shape=[self.cfg.worker_num_per_node], dtype=np.int32)
|
||||
self.worker_healthy_live_signal = IPCSignal(
|
||||
@@ -989,8 +1018,12 @@ class LLMEngine:
|
||||
print(f"Error extracting sub services: {e}")
|
||||
|
||||
self.engine_worker_queue.cleanup()
|
||||
if hasattr(self, "zmq_server") and self.zmq_server is not None:
|
||||
self.zmq_server.close()
|
||||
if hasattr(self, "send_response_server") and self.send_response_server is not None:
|
||||
self.send_response_server.close()
|
||||
if hasattr(self, "recv_request_server") and self.recv_request_server is not None:
|
||||
self.recv_request_server.close()
|
||||
if hasattr(self, "recv_control_cmd_server") and self.recv_control_cmd_server is not None:
|
||||
self.recv_control_cmd_server.close()
|
||||
if hasattr(self, "dp_processed"):
|
||||
for p in self.dp_processed:
|
||||
p.join()
|
||||
@@ -1208,7 +1241,6 @@ class LLMEngine:
|
||||
engine_worker_queue_port=self.cfg.engine_worker_queue_port,
|
||||
pid_suffix=self.ipc_signal_suffix,
|
||||
)
|
||||
self.launched_cache_manager_signal.value[0] = 1
|
||||
|
||||
def check_health(self, time_interval_threashold=30):
|
||||
"""
|
||||
@@ -1222,6 +1254,72 @@ class LLMEngine:
|
||||
|
||||
return True, ""
|
||||
|
||||
def launch_components(self):
|
||||
self.token_processor.tasks_queue = self.engine_worker_queue
|
||||
|
||||
if envs.ENABLE_V1_KVCACHE_SCHEDULER:
|
||||
self.insert_task_to_worker_thread = threading.Thread(target=self._scheduler_task_to_worker_v1, daemon=True)
|
||||
else:
|
||||
self.insert_task_to_worker_thread = threading.Thread(target=self._insert_task_to_worker, daemon=True)
|
||||
self.insert_task_to_worker_thread.start()
|
||||
|
||||
if self.api_server_pid is not None:
|
||||
self.insert_task_to_scheduler_thread = threading.Thread(
|
||||
target=self._insert_zmq_task_to_scheduler, daemon=True
|
||||
)
|
||||
self.insert_task_to_scheduler_thread.start()
|
||||
|
||||
self.receive_output_thread = threading.Thread(target=self._zmq_send_generated_tokens, daemon=True)
|
||||
self.receive_output_thread.start()
|
||||
|
||||
# Start TokenProcessor thread
|
||||
self.token_processor.run()
|
||||
|
||||
if self.cfg.splitwise_role != "mixed":
|
||||
# 单机逻辑
|
||||
self.engine_worker_queue.available_prefill_instances.put(1)
|
||||
self.split_mode_get_tasks()
|
||||
if self.cfg.scheduler_config.name == "splitwise":
|
||||
self.splitwise_receive_thread = threading.Thread(target=self.split_connector.start_receiver, args=())
|
||||
self.splitwise_receive_thread.daemon = True
|
||||
self.splitwise_receive_thread.start()
|
||||
|
||||
self.cfg.init_cache_info()
|
||||
|
||||
role = self.cfg.splitwise_role
|
||||
host_ip = self.cfg.host_ip
|
||||
disaggregate = self.cfg.disaggregate_info
|
||||
if self.cfg.scheduler_config.name == "splitwise":
|
||||
self.scheduler.start(role, host_ip, disaggregate)
|
||||
|
||||
time.sleep(1)
|
||||
expert_service_nums = self.cfg.parallel_config.data_parallel_size // self.cfg.nnode
|
||||
if self.cfg.parallel_config.enable_expert_parallel and self.cfg.parallel_config.data_parallel_size > 1:
|
||||
self.dp_processed = []
|
||||
for i in range(
|
||||
1,
|
||||
expert_service_nums,
|
||||
):
|
||||
time.sleep(1)
|
||||
self.dp_processed.append(
|
||||
multiprocessing.Process(
|
||||
target=start_expert_service,
|
||||
args=(
|
||||
self.cfg,
|
||||
i + self.cfg.node_rank * self.cfg.worker_num_per_node,
|
||||
self.ipc_signal_suffix,
|
||||
),
|
||||
)
|
||||
)
|
||||
llm_logger.info(
|
||||
f"Engine is initialized successfully with {self.cfg.tensor_parallel_size}"
|
||||
+ f" data parallel id {i}"
|
||||
)
|
||||
self.dp_processed[-1].start()
|
||||
for i in range(1, expert_service_nums):
|
||||
while self.launched_expert_service_signal.value[i] == 0:
|
||||
time.sleep(1)
|
||||
|
||||
def check_worker_initialize_status(self):
|
||||
"""
|
||||
Check the initlialize status of workers by stdout logging
|
||||
@@ -1247,10 +1345,6 @@ class LLMEngine:
|
||||
|
||||
self.checking_worker_status_thread = threading.Thread(target=detect_thread, daemon=True)
|
||||
self.checking_worker_status_thread.start()
|
||||
checking_worker_init_kv_cache_status_thread = None
|
||||
if self.do_profile:
|
||||
checking_worker_init_kv_cache_status_thread = threading.Thread(target=self._stop_profile, daemon=True)
|
||||
checking_worker_init_kv_cache_status_thread.start()
|
||||
|
||||
# display weight loadding progress
|
||||
with tqdm(total=100, desc="Loading Weights") as pbar:
|
||||
@@ -1281,8 +1375,6 @@ class LLMEngine:
|
||||
self.worker_init_status["finished"] = True
|
||||
try:
|
||||
self.checking_worker_status_thread.join(timeout=1)
|
||||
if checking_worker_init_kv_cache_status_thread is not None:
|
||||
checking_worker_init_kv_cache_status_thread.join(timeout=1)
|
||||
except Exception:
|
||||
pass
|
||||
return True
|
||||
|
@@ -26,7 +26,7 @@ import weakref
|
||||
import numpy as np
|
||||
|
||||
from fastdeploy.engine.resource_manager import ResourceManager
|
||||
from fastdeploy.inter_communicator import EngineWorkerQueue
|
||||
from fastdeploy.inter_communicator import EngineWorkerQueue, IPCSignal
|
||||
from fastdeploy.metrics.metrics import main_process_metrics
|
||||
from fastdeploy.output.token_processor import TokenProcessor
|
||||
from fastdeploy.splitwise.splitwise_connector import SplitwiseConnector
|
||||
@@ -59,8 +59,8 @@ class ExpertService:
|
||||
self.cfg.disaggregate_info = None
|
||||
|
||||
self.scheduler = cfg.scheduler_config.scheduler()
|
||||
|
||||
self.scheduler.reset_nodeid(f"{self.scheduler.infer.nodeid}_{local_data_parallel_id!s}")
|
||||
if cfg.scheduler_config.name == "splitwise":
|
||||
self.scheduler.reset_nodeid(f"{self.scheduler.infer.nodeid}_{local_data_parallel_id!s}")
|
||||
|
||||
self.cfg.parallel_config.local_data_parallel_id = local_data_parallel_id
|
||||
|
||||
@@ -127,7 +127,7 @@ class ExpertService:
|
||||
cache_config=self.cfg.cache_config,
|
||||
tensor_parallel_size=self.cfg.tensor_parallel_size,
|
||||
device_ids=self.cfg.local_device_ids,
|
||||
pod_ip=self.cfg.pod_ips[0],
|
||||
pod_ip=self.cfg.master_ip,
|
||||
engine_worker_queue_port=self.cfg.engine_worker_queue_port,
|
||||
pid_suffix=f"{local_data_parallel_id}_{ipc_signal_suffix}",
|
||||
)
|
||||
@@ -143,14 +143,29 @@ class ExpertService:
|
||||
self.token_processor.run()
|
||||
|
||||
self.cfg.init_cache_info()
|
||||
|
||||
role = self.cfg.splitwise_role
|
||||
host_ip = self.cfg.host_ip
|
||||
disaggregate = self.cfg.disaggregate_info
|
||||
self.scheduler.start(role, host_ip, disaggregate)
|
||||
if self.cfg.scheduler_config.name == "splitwise":
|
||||
role = self.cfg.splitwise_role
|
||||
host_ip = self.cfg.host_ip
|
||||
disaggregate = self.cfg.disaggregate_info
|
||||
self.scheduler.start(role, host_ip, disaggregate)
|
||||
self.cfg.print()
|
||||
|
||||
console_logger.info(f"Worker processes are launched with {time.time() - start_time} seconds.")
|
||||
launched_expert_service_signal_data = np.zeros(
|
||||
shape=[self.cfg.parallel_config.data_parallel_size // self.cfg.nnode], dtype=np.int32
|
||||
)
|
||||
self.launched_expert_service_signal = IPCSignal(
|
||||
name="launched_expert_service_signal",
|
||||
array=launched_expert_service_signal_data,
|
||||
dtype=np.int32,
|
||||
suffix=ipc_signal_suffix,
|
||||
create=False,
|
||||
)
|
||||
local_rank = local_data_parallel_id % self.cfg.worker_num_per_node
|
||||
self.launched_expert_service_signal.value[local_rank] = 1
|
||||
|
||||
console_logger.info(
|
||||
f"Worker processes(rank {local_rank}) are launched with {time.time() - start_time} seconds."
|
||||
)
|
||||
return True
|
||||
|
||||
def _insert_task_to_worker(self):
|
||||
@@ -351,7 +366,6 @@ class ExpertService:
|
||||
os.killpg(p.pid, signal.SIGTERM)
|
||||
except:
|
||||
pass
|
||||
|
||||
if hasattr(self, "zmq_server") and self.zmq_server is not None:
|
||||
self.zmq_server.close()
|
||||
|
||||
@@ -363,6 +377,13 @@ def start_expert_service(cfg, local_data_parallel_id, ipc_signal_suffix):
|
||||
expert_service = ExpertService(cfg, local_data_parallel_id)
|
||||
try:
|
||||
expert_service.start(ipc_signal_suffix, local_data_parallel_id)
|
||||
expert_service.split_connector.start_receiver()
|
||||
if cfg.splitwise_role != "mixed":
|
||||
expert_service.split_connector.start_receiver()
|
||||
else:
|
||||
def deamon_thread():
|
||||
while True:
|
||||
time.sleep(10)
|
||||
t_deamon = threading.Thread(target=deamon_thread, daemon=True)
|
||||
t_deamon.start()
|
||||
except Exception as e:
|
||||
llm_logger.exception(f"Expert service failed to start: {e}")
|
||||
|
@@ -24,6 +24,7 @@ from typing import Any, Dict, Optional, Union
import numpy as np

from fastdeploy.engine.sampling_params import SamplingParams
from fastdeploy.entrypoints.openai.protocol import ToolCall
from fastdeploy.utils import data_processor_logger
from fastdeploy.worker.output import LogprobsLists, SampleLogprobs

@@ -249,6 +250,7 @@ class CompletionOutput:
draft_token_ids: list[int] = None
text: Optional[str] = None
reasoning_content: Optional[str] = None
tool_calls: Optional[ToolCall] = None

def to_dict(self):
    """
@@ -51,14 +51,15 @@ class ResourceManager:
|
||||
"""
|
||||
self.cfg = config.cache_config
|
||||
self.max_num_seqs = max_num_seqs
|
||||
self.stop_flags = [True] * max_num_seqs
|
||||
self.stop_flags = [True] * max_num_seqs # flag set to true if the slot has not been taken
|
||||
self.enable_prefix_cache = config.cache_config.enable_prefix_caching
|
||||
self.cache_manager = PrefixCacheManager(config, tensor_parallel_size, splitwise_role, local_data_parallel_id)
|
||||
self.tasks_list = [None] * max_num_seqs
|
||||
self.tasks_list = [None] * max_num_seqs # task slots
|
||||
self.req_dict = dict()
|
||||
# current batch status of the engine
|
||||
self.real_bsz = 0
|
||||
llm_logger.info(f"{self.info()}")
|
||||
main_process_metrics.max_batch_size.set(max_num_seqs)
|
||||
|
||||
def reset_cache_config(self, cfg):
|
||||
"""
|
||||
@@ -222,30 +223,30 @@ class ResourceManager:
|
||||
Returns:
|
||||
list: processed task list
|
||||
"""
|
||||
|
||||
allocated_position = 0
|
||||
processing_task_index = 0
|
||||
allocated_position = 0 # number of tasks that have been allocated, also the position in request slots
|
||||
processing_task_index = 0 # current task
|
||||
processed_tasks = list()
|
||||
while allocated_position < self.max_num_seqs:
|
||||
if processing_task_index >= len(tasks):
|
||||
while allocated_position < self.max_num_seqs: # loop until all tasks are allocated resources for
|
||||
if processing_task_index >= len(tasks): # if all taskes have been tried, don't give a second chance
|
||||
break
|
||||
|
||||
can_insert = False
|
||||
while allocated_position + 1 <= self.max_num_seqs:
|
||||
if sum(self.stop_flags[allocated_position : allocated_position + 1]) == 1:
|
||||
can_insert = True
|
||||
can_insert = True # if there is a empty slot, try to allocate resources for current task
|
||||
break
|
||||
allocated_position += 1
|
||||
if can_insert:
|
||||
if self.stop_flags[allocated_position]:
|
||||
|
||||
task = tasks[processing_task_index]
|
||||
task = tasks[processing_task_index] # retrieve current task
|
||||
|
||||
if task.get("seed") is None:
|
||||
task.set("seed", random.randint(0, 9223372036854775807))
|
||||
task.idx = allocated_position
|
||||
|
||||
if self.enable_prefix_cache:
|
||||
if self.enable_prefix_cache: # if prefix caching is enabled
|
||||
# 1. request for enough blocks for current task
|
||||
cache_prepare_time = time.time()
|
||||
common_block_ids, unique_block_ids, hit_info = self.cache_manager.request_block_ids(
|
||||
task,
|
||||
@@ -255,14 +256,15 @@ class ResourceManager:
|
||||
if unique_block_ids is None:
|
||||
llm_logger.warning("req_id: {0} not enough blocks available".format(task["req_id"]))
|
||||
return
|
||||
|
||||
# 2. record cache hit information, and return the number of tokens already in cache
|
||||
cached_len = self._record_request_cache_info(
|
||||
task, common_block_ids, unique_block_ids, hit_info
|
||||
)
|
||||
task.cache_prepare_time = time.time() - cache_prepare_time
|
||||
|
||||
# 3. if prefill/decode disaggregation is enabled
|
||||
if task.disaggregate_info is not None:
|
||||
if task.disaggregate_info["role"] == "prefill":
|
||||
# record the slot position for current task, indexed by request id
|
||||
self.req_dict[task.request_id] = allocated_position
|
||||
task.disaggregate_info["block_tables"] = task.block_tables
|
||||
self._delete_cached_data(task, cached_len)
|
||||
@@ -270,17 +272,19 @@ class ResourceManager:
|
||||
self.req_dict[task.request_id] = allocated_position
|
||||
task.disaggregate_info["block_tables"] = task.need_block_tables
|
||||
else:
|
||||
# remove cached tokens from prompt token ids to avoid kv recomputation
|
||||
self._delete_cached_data(task, cached_len)
|
||||
|
||||
else:
|
||||
else: # if prefix caching is disabled
|
||||
# 1. directly allocate empty block from the cache, if there is any
|
||||
block_tables = self._get_block_tables(task.prompt_token_ids_len)
|
||||
if not block_tables:
|
||||
llm_logger.error(f"req_id: {task.request_id} block_tables is empty")
|
||||
continue
|
||||
continue # retry
|
||||
else:
|
||||
task.block_tables = block_tables
|
||||
task.need_block_tables = task.block_tables
|
||||
|
||||
# 2. if prefill/decode disaggregation is enabled
|
||||
if task.disaggregate_info is not None:
|
||||
task.disaggregate_info["block_tables"] = block_tables
|
||||
if task.disaggregate_info["role"] == "prefill":
|
||||
@@ -288,8 +292,8 @@ class ResourceManager:
|
||||
elif task.disaggregate_info["role"] == "decode":
|
||||
self.req_dict[task.request_id] = allocated_position
|
||||
|
||||
processed_tasks.append(task)
|
||||
self.stop_flags[allocated_position] = False
|
||||
processed_tasks.append(task) # add current task
|
||||
self.stop_flags[allocated_position] = False # mark the slot as occupied
|
||||
task.inference_start_time = time.time()
|
||||
task.inference_time_cost = -1.0
|
||||
task.tokens_all_num = 0
|
||||
@@ -303,11 +307,18 @@ class ResourceManager:
|
||||
processing_task_index += 1
|
||||
|
||||
# batch size when the statistical engine is inferring
|
||||
# determine batch size by index of the first slot that is not occupied
|
||||
for i in range(self.max_num_seqs - 1, -1, -1):
|
||||
if not self.stop_flags[i]:
|
||||
self.real_bsz = i + 1
|
||||
break
|
||||
|
||||
# record batch size here
|
||||
task_used_block_num = sum([len(task.block_tables) if task else 0 for task in self.tasks_list])
|
||||
main_process_metrics.available_gpu_block_num.set(self.total_block_number() - task_used_block_num)
|
||||
main_process_metrics.batch_size.set(self.max_num_seqs - self.available_batch())
|
||||
main_process_metrics.gpu_cache_usage_perc.set(self.get_gpu_cache_usage_perc())
|
||||
|
||||
llm_logger.info(
|
||||
f"Number of allocated requests: {len(tasks)}, number of " f"running requests in worker: {self.real_bsz}"
|
||||
)
|
||||
@@ -339,6 +350,11 @@ class ResourceManager:
|
||||
task.cpu_cache_token_num = hit_info["cpu_cache_blocks"] * self.cfg.block_size
|
||||
task.cache_info = (cache_block_num, no_cache_block_num)
|
||||
|
||||
# Report the number of cached tokens to Prometheus metrics
|
||||
main_process_metrics.prefix_cache_token_num.inc(task.num_cached_tokens)
|
||||
main_process_metrics.prefix_gpu_cache_token_num.inc(task.gpu_cache_token_num)
|
||||
main_process_metrics.prefix_cpu_cache_token_num.inc(task.cpu_cache_token_num)
|
||||
|
||||
cached_len = len(common_block_ids) * self.cfg.block_size
|
||||
task.block_tables = common_block_ids + unique_block_ids
|
||||
task.need_block_tables = unique_block_ids
|
||||
|
@@ -218,20 +218,22 @@ class SamplingParams:
|
||||
prompt_token_ids = tokenizer.encode(text=prompt, add_special_tokens=False)["input_ids"]
|
||||
|
||||
if len(prompt_token_ids) != 1:
|
||||
logger.warning(
|
||||
f"Skip bad_words: {prompt}."
|
||||
f"Bad words should be a single token."
|
||||
f"Got tokens: {prompt_token_ids}."
|
||||
)
|
||||
if not add_prefix_space:
|
||||
logger.warning(
|
||||
f"Skip bad_words: <{prompt}>."
|
||||
f"Bad words should be a single token."
|
||||
f"Got tokens: {prompt_token_ids}."
|
||||
)
|
||||
continue
|
||||
|
||||
if prompt_token_ids[0] > tokenizer.vocab_size:
|
||||
logger.warning(
|
||||
f"Skip bad_words: {prompt}."
|
||||
f"All token id values should be satisfying:"
|
||||
f" 0 <= token_id < {tokenizer.vocab_size}."
|
||||
f"Got token: {prompt_token_ids}."
|
||||
)
|
||||
if not add_prefix_space:
|
||||
logger.warning(
|
||||
f"Skip bad_words: <{prompt}>."
|
||||
f"All token id values should be satisfying:"
|
||||
f" 0 <= token_id < {tokenizer.vocab_size}."
|
||||
f"Got token: {prompt_token_ids}."
|
||||
)
|
||||
continue
|
||||
|
||||
if prompt_token_ids not in self._bad_words_token_ids:
|
||||
|
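A hedged sketch of the bad-words checks in the hunk above: each entry must encode to exactly one token whose id lies inside the vocabulary, otherwise it is skipped with a warning. `encode` and `vocab_size` below are stand-ins for the tokenizer used by SamplingParams.

```python
# Minimal sketch of the bad-words validation: single-token entries only,
# and the token id must fall inside the vocabulary range.
def collect_bad_word_ids(bad_words, encode, vocab_size):
    token_ids = []
    for word in bad_words:
        ids = encode(word)
        if len(ids) != 1:
            continue  # skip multi-token entries, as the hunk does
        if not 0 <= ids[0] < vocab_size:
            continue  # skip out-of-vocabulary ids
        if ids not in token_ids:
            token_ids.append(ids)
    return token_ids

# toy tokenizer: one id per known word
table = {"foo": 7, "bar": 11}
encode = lambda w: [table[t] for t in w.split()]
print(collect_bad_word_ids(["foo", "foo bar", "bar"], encode, vocab_size=16))
# -> [[7], [11]]
```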
@@ -27,6 +27,7 @@ import paddle
|
||||
|
||||
from fastdeploy.engine.request import Request, RequestStatus, RequestType
|
||||
from fastdeploy.engine.resource_manager import ResourceManager
|
||||
from fastdeploy.metrics.metrics import main_process_metrics
|
||||
from fastdeploy.utils import llm_logger
|
||||
|
||||
|
||||
@@ -75,6 +76,8 @@ class ResourceManagerV1(ResourceManager):
|
||||
self.running: list[Request] = []
|
||||
self.finish_execution_pool = ThreadPoolExecutor(max_workers=1)
|
||||
self.lock = threading.Lock()
|
||||
self.to_be_rescheduled_request_id_set = set()
|
||||
main_process_metrics.max_batch_size.set(max_num_seqs)
|
||||
|
||||
def allocated_slots(self, request: Request):
|
||||
return len(request.block_tables) * self.config.cache_config.block_size
|
||||
@@ -97,16 +100,26 @@ class ResourceManagerV1(ResourceManager):
|
||||
def _prepare_preempt_task(self, request):
|
||||
return ScheduledPreemptTask(idx=request.idx, request_id=request.request_id)
|
||||
|
||||
def reschedule_preempt_task(self, request_id):
|
||||
with self.lock:
|
||||
if request_id in self.to_be_rescheduled_request_id_set and request_id in self.requests:
|
||||
request = self.requests[request_id]
|
||||
self.waiting.appendleft(request)
|
||||
self.to_be_rescheduled_request_id_set.remove(request_id)
|
||||
|
||||
def _trigger_preempt(self, request, num_new_blocks, preempted_reqs, scheduled_reqs):
|
||||
"""
|
||||
If the request cannot be scheduled, preempt running requests one by one (last in, first out) until it can be.
|
||||
"""
|
||||
can_schedule = True
|
||||
while True:
|
||||
if not self.cache_manager.can_allocate_gpu_blocks(num_new_blocks):
|
||||
preempted_req = self.running.pop()
|
||||
preempted_req.status = RequestStatus.PREEMPTED
|
||||
preempted_req.num_computed_tokens = 0
|
||||
preempted_req.prefill_block_num = 0
|
||||
self._free_blocks(preempted_req)
|
||||
self.waiting.appendleft(preempted_req)
|
||||
preempted_req.prefill_block_num = None
|
||||
self.to_be_rescheduled_request_id_set.add(preempted_req.request_id)
|
||||
preempted_reqs.append(preempted_req)
|
||||
scheduled_reqs.append(self._prepare_preempt_task(preempted_req))
|
||||
if preempted_req == request:
|
||||
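The preemption loop above works last-in-first-out: the most recently running request gives its blocks back and is pushed to the front of the waiting queue until the allocation fits. A hedged sketch of that shape, with illustrative names and a toy block pool:

```python
# Hedged sketch of LIFO preemption: pop the newest running request, free its
# blocks, and reschedule it at the front of the waiting queue until the
# allocator can satisfy num_new_blocks.
from collections import deque

def preempt_until_allocatable(running, waiting, can_allocate, free_blocks, num_new_blocks):
    preempted = []
    while not can_allocate(num_new_blocks) and running:
        victim = running.pop()          # last in, first out
        free_blocks(victim)             # recycle its GPU blocks
        waiting.appendleft(victim)      # reschedule it at the front
        preempted.append(victim)
    return preempted

free_pool = {"blocks": 2}
running, waiting = ["req-a", "req-b"], deque()
preempt_until_allocatable(
    running, waiting,
    can_allocate=lambda n: free_pool["blocks"] >= n,
    free_blocks=lambda r: free_pool.update(blocks=free_pool["blocks"] + 4),
    num_new_blocks=5,
)
print(running, list(waiting), free_pool)  # ['req-a'] ['req-b'] {'blocks': 6}
```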
@@ -134,26 +147,31 @@ class ResourceManagerV1(ResourceManager):
|
||||
|
||||
input_ids_lst = request.prompt_token_ids + request.output_token_ids
|
||||
input_ids = paddle.to_tensor(input_ids_lst, dtype="int64")
|
||||
grid_thw = []
|
||||
for one in inputs["grid_thw"]:
|
||||
if one[0] == 1:
|
||||
grid_thw.append(one)
|
||||
else:
|
||||
grid_thw.extend([[2, one[1], one[2]]] * (one[0] // 2))
|
||||
|
||||
input_ids = paddle.to_tensor(input_ids_lst, dtype="int64")
|
||||
image_patch_id = inputs["image_patch_id"]
|
||||
grid_thw = paddle.to_tensor(grid_thw, dtype="int64")
|
||||
|
||||
if request.multimodal_img_boundaries is None:
|
||||
grid_thw = []
|
||||
for one in inputs["grid_thw"]:
|
||||
if one[0] == 1:
|
||||
grid_thw.append(one)
|
||||
else:
|
||||
grid_thw.extend([[2, one[1], one[2]]] * (one[0] // 2))
|
||||
|
||||
grid_thw = paddle.to_tensor(grid_thw, dtype="int64")
|
||||
from fastdeploy.model_executor.ops.gpu import get_img_boundaries
|
||||
|
||||
request.multimodal_img_boundaries = get_img_boundaries(
|
||||
task_input_ids=input_ids, grid_thw=grid_thw, image_patch_id=image_patch_id
|
||||
).numpy()
|
||||
|
||||
grid_thw = grid_thw.numpy().reshape([-1, 3])
|
||||
inputs["grid_thw"] = grid_thw
|
||||
|
||||
grid_thw = inputs["grid_thw"]
|
||||
img_boundaries_idx = request.multimodal_img_boundaries[0]
|
||||
img_num_per_boundary = request.multimodal_img_boundaries[1]
|
||||
ori_prompt_len = img_boundaries_idx[-1].item()
|
||||
grid_thw = grid_thw.numpy().reshape([-1, 3])
|
||||
pre_end_idx = request.num_computed_tokens
|
||||
new_end_idx = pre_end_idx + num_new_tokens
|
||||
if new_end_idx < ori_prompt_len and input_ids[new_end_idx - 1] == image_patch_id:
|
||||
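The grid_thw normalization moved inside the `multimodal_img_boundaries is None` branch above keeps `[1, h, w]` image entries as-is and splits a t-frame video into `t // 2` two-frame entries. A small sketch of just that transformation (illustrative only):

```python
# Sketch of the grid_thw flattening: images keep [1, h, w]; a video with t
# frames becomes t // 2 entries of two frames each.
def flatten_grid_thw(grid_thw):
    flat = []
    for t, h, w in grid_thw:
        if t == 1:
            flat.append([t, h, w])
        else:
            flat.extend([[2, h, w]] * (t // 2))
    return flat

print(flatten_grid_thw([[1, 32, 32], [4, 16, 16]]))
# -> [[1, 32, 32], [2, 16, 16], [2, 16, 16]]
```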
@@ -187,7 +205,6 @@ class ResourceManagerV1(ResourceManager):
|
||||
)
|
||||
request.num_image_end = img_num_per_boundary[new_boundary_idx]
|
||||
|
||||
request.num_image_end = img_num_per_boundary[new_boundary_idx]
|
||||
request.image_type_ids_start = np.sum(grid_thw[: request.num_image_start, 0])
|
||||
request.image_type_ids_end = np.sum(grid_thw[: request.num_image_end, 0])
|
||||
request.image_start = np.sum(np.prod(grid_thw[: request.num_image_start], axis=1))
|
||||
@@ -201,6 +218,9 @@ class ResourceManagerV1(ResourceManager):
|
||||
return False
|
||||
|
||||
def schedule(self):
|
||||
"""
|
||||
Try to pull a batch of requests from the waiting queue and schedule them.
|
||||
"""
|
||||
with self.lock:
|
||||
scheduled_reqs: list[Request] = []
|
||||
preempted_reqs: list[Request] = []
|
||||
@@ -262,7 +282,7 @@ class ResourceManagerV1(ResourceManager):
|
||||
request.block_tables.extend(self.cache_manager.allocate_gpu_blocks(num_new_block))
|
||||
# Prepare prefill task
|
||||
scheduled_reqs.append(self._prepare_prefill_task(request, num_new_tokens))
|
||||
else:
|
||||
else: # Not enough blocks to allocate, trigger preemption
|
||||
can_schedule = self._trigger_preempt(request, num_new_block, preempted_reqs, scheduled_reqs)
|
||||
if not can_schedule:
|
||||
break
|
||||
@@ -277,7 +297,7 @@ class ResourceManagerV1(ResourceManager):
|
||||
while self.waiting and token_budget > 0:
|
||||
if len(self.running) == self.max_num_seqs:
|
||||
break
|
||||
if self.config.enable_mm and self.exist_prefill(scheduled_reqs):
|
||||
if (self.config.enable_mm or paddle.is_compiled_with_xpu()) and self.exist_prefill(scheduled_reqs):
|
||||
break
|
||||
request = self.waiting[0]
|
||||
if request.status == RequestStatus.WAITING:
|
||||
@@ -285,6 +305,7 @@ class ResourceManagerV1(ResourceManager):
|
||||
if self.config.cache_config.enable_prefix_caching:
|
||||
success = self.get_prefix_cached_blocks(request)
|
||||
if not success:
|
||||
self._free_blocks(request)
|
||||
break
|
||||
|
||||
num_new_tokens = self._get_num_new_tokens(request, token_budget)
|
||||
@@ -307,16 +328,24 @@ class ResourceManagerV1(ResourceManager):
|
||||
self.stop_flags[allocated_position] = False
|
||||
self.req_dict[request.request_id] = allocated_position
|
||||
else:
|
||||
if self.config.cache_config.enable_prefix_caching:
|
||||
self._free_blocks(request)
|
||||
break
|
||||
elif request.status == RequestStatus.PREEMPTED:
|
||||
request.need_prefill_tokens = (
|
||||
request.num_total_tokens
|
||||
) # Before a preempted task is rescheduled it has already been sent to the engine and emits no more tokens, so num_total_tokens is stable and correct here
|
||||
if self.config.cache_config.enable_prefix_caching:
|
||||
success = self.get_prefix_cached_blocks(request)
|
||||
if not success:
|
||||
self._free_blocks(request)
|
||||
break
|
||||
num_new_tokens = self._get_num_new_tokens(request, token_budget)
|
||||
num_new_block = self.get_new_block_nums(request, num_new_tokens)
|
||||
# Allocate blocks to prefill
|
||||
if self.cache_manager.can_allocate_gpu_blocks(num_new_block):
|
||||
request.block_tables.extend(self.cache_manager.allocate_gpu_blocks(num_new_block))
|
||||
if not request.get("skip_allocate", False):
|
||||
request.block_tables.extend(self.cache_manager.allocate_gpu_blocks(num_new_block))
|
||||
self.waiting.popleft()
|
||||
self.running.append(request)
|
||||
scheduled_reqs.append(self._prepare_prefill_task(request, num_new_tokens))
|
||||
@@ -324,10 +353,16 @@ class ResourceManagerV1(ResourceManager):
|
||||
request.num_computed_tokens += num_new_tokens
|
||||
request.status = RequestStatus.RUNNING
|
||||
else:
|
||||
if self.config.cache_config.enable_prefix_caching:
|
||||
self._free_blocks(request)
|
||||
break
|
||||
else:
|
||||
llm_logger.error("Unknown request status type")
|
||||
if scheduled_reqs:
|
||||
task_used_block_num = sum([len(task.block_tables) if task else 0 for task in self.tasks_list])
|
||||
main_process_metrics.available_gpu_block_num.set(self.total_block_number() - task_used_block_num)
|
||||
main_process_metrics.batch_size.set(self.max_num_seqs - self.available_batch())
|
||||
main_process_metrics.gpu_cache_usage_perc.set(self.get_gpu_cache_usage_perc())
|
||||
llm_logger.debug(f"schedued_reqs: {scheduled_reqs}")
|
||||
return scheduled_reqs
|
||||
|
||||
@@ -369,8 +404,13 @@ class ResourceManagerV1(ResourceManager):
|
||||
request.block_tables = common_block_ids
|
||||
request.skip_allocate = False
|
||||
|
||||
# Report the number of cached tokens to Prometheus metrics
|
||||
main_process_metrics.prefix_cache_token_num.inc(matched_token_num)
|
||||
main_process_metrics.prefix_gpu_cache_token_num.inc(request.gpu_cache_token_num)
|
||||
main_process_metrics.prefix_cpu_cache_token_num.inc(request.cpu_cache_token_num)
|
||||
|
||||
if matched_token_num == request.prompt_token_ids_len:
|
||||
request.num_computed_tokens = matched_token_num - 1
|
||||
request.num_computed_tokens = matched_token_num - self.config.cache_config.block_size
|
||||
request.skip_allocate = True
|
||||
else:
|
||||
request.num_computed_tokens = matched_token_num
|
||||
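The change above handles a full prefix-cache hit by rolling back one whole block (instead of one token) and skipping allocation, presumably so the final positions are still run through the model and the computed-token count stays block-aligned. A hedged sketch with illustrative names:

```python
# Hedged sketch of the full cache-hit branch: when every prompt token is
# cached, roll num_computed_tokens back by one block and skip allocation.
def computed_tokens_after_cache_hit(matched, prompt_len, block_size):
    if matched == prompt_len:
        return matched - block_size, True    # roll back one block, skip allocation
    return matched, False

print(computed_tokens_after_cache_hit(matched=128, prompt_len=128, block_size=64))
# -> (64, True)
print(computed_tokens_after_cache_hit(matched=64, prompt_len=128, block_size=64))
# -> (64, False)
```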
@@ -381,14 +421,22 @@ class ResourceManagerV1(ResourceManager):
|
||||
return False
|
||||
|
||||
def add_request(self, request: Request) -> None:
|
||||
self.waiting.append(request)
|
||||
self.requests[request.request_id] = request
|
||||
with self.lock:
|
||||
self.waiting.append(request)
|
||||
self.requests[request.request_id] = request
|
||||
|
||||
def _free_blocks(self, request: Request):
|
||||
if self.config.cache_config.enable_prefix_caching:
|
||||
# TODO(chengyanfu): support caching output blocks for prefix caching
|
||||
self.cache_manager.release_block_ids_async(request)
|
||||
self.cache_manager.recycle_gpu_blocks(request.block_tables[request.prefill_block_num :])
|
||||
if request.get("prefill_block_num", None) is None:
|
||||
leaf_node = self.cache_manager.req_leaf_map[request.request_id]
|
||||
self.cache_manager.decrease_request_share_count(request.request_id)
|
||||
self.cache_manager.free_nodes_directly(leaf_node)
|
||||
self.cache_manager.recycle_gpu_blocks(request.block_tables[request.cache_info[0]:])
|
||||
|
||||
else:
|
||||
self.cache_manager.release_block_ids_async(request)
|
||||
self.cache_manager.recycle_gpu_blocks(request.block_tables[request.prefill_block_num :])
|
||||
else:
|
||||
self.cache_manager.recycle_gpu_blocks(request.block_tables)
|
||||
request.block_tables = []
|
||||
@@ -409,9 +457,20 @@ class ResourceManagerV1(ResourceManager):
|
||||
if request is None:
|
||||
# Invalid request ID.
|
||||
continue
|
||||
request.status = RequestStatus.FINISHED
|
||||
self.running.remove(request)
|
||||
self._free_blocks(request)
|
||||
if request in self.running: # normally run and finished
|
||||
self.running.remove(request)
|
||||
request.status = RequestStatus.FINISHED
|
||||
self._free_blocks(request)
|
||||
if (
|
||||
request.request_id in self.to_be_rescheduled_request_id_set
|
||||
): # finished after being preempted; its blocks have already been recycled.
|
||||
self.to_be_rescheduled_request_id_set.remove(
|
||||
request.request_id
|
||||
) # just remove from to_be_rescheduled_request_id_set
|
||||
if (
|
||||
request in self.waiting
|
||||
): # a finished request was rescheduled from preempted back to waiting; this is an unexpected state and should never happen
|
||||
raise RuntimeError(f"request {request.request_id} scheduled into waiting list, after finished")
|
||||
self.tasks_list[request.idx] = None
|
||||
self.stop_flags[request.idx] = True
|
||||
del self.requests[req_id]
|
||||
|
@@ -14,6 +14,7 @@
|
||||
# limitations under the License.
|
||||
"""
|
||||
|
||||
import uuid
|
||||
from copy import deepcopy
|
||||
from typing import List, Literal, Union
|
||||
from urllib.parse import urlparse
|
||||
@@ -156,3 +157,7 @@ def parse_chat_messages(messages):
|
||||
|
||||
conversation.append({"role": role, "content": parsed_content})
|
||||
return conversation
|
||||
|
||||
|
||||
def random_tool_call_id() -> str:
|
||||
return f"chatcmpl-tool-{str(uuid.uuid4().hex)}"
|
||||
|
@@ -19,11 +19,12 @@ import uuid
|
||||
|
||||
import numpy as np
|
||||
|
||||
from fastdeploy import envs
|
||||
from fastdeploy.input.preprocess import InputPreprocessor
|
||||
from fastdeploy.inter_communicator import IPCSignal, ZmqClient
|
||||
from fastdeploy.inter_communicator import IPCSignal, ZmqIpcClient
|
||||
from fastdeploy.metrics.work_metrics import work_process_metrics
|
||||
from fastdeploy.platforms import current_platform
|
||||
from fastdeploy.utils import EngineError, api_server_logger
|
||||
from fastdeploy.utils import EngineError, StatefulSemaphore, api_server_logger
|
||||
|
||||
|
||||
class EngineClient:
|
||||
@@ -43,6 +44,8 @@ class EngineClient:
|
||||
reasoning_parser=None,
|
||||
data_parallel_size=1,
|
||||
enable_logprob=False,
|
||||
workers=1,
|
||||
tool_parser=None,
|
||||
):
|
||||
input_processor = InputPreprocessor(
|
||||
tokenizer,
|
||||
@@ -50,6 +53,7 @@ class EngineClient:
|
||||
limit_mm_per_prompt,
|
||||
mm_processor_kwargs,
|
||||
enable_mm,
|
||||
tool_parser,
|
||||
)
|
||||
self.enable_logprob = enable_logprob
|
||||
self.enable_mm = enable_mm
|
||||
@@ -75,12 +79,13 @@ class EngineClient:
|
||||
suffix=pid,
|
||||
create=False,
|
||||
)
|
||||
self.semaphore = StatefulSemaphore((envs.FD_SUPPORT_MAX_CONNECTIONS + workers - 1) // workers)
|
||||
|
||||
def create_zmq_client(self, model, mode):
|
||||
"""
|
||||
Create a ZMQ client.
|
||||
"""
|
||||
self.zmq_client = ZmqClient(model, mode)
|
||||
self.zmq_client = ZmqIpcClient(model, mode)
|
||||
self.zmq_client.connect()
|
||||
|
||||
def format_and_add_data(self, prompts: dict):
|
||||
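The `StatefulSemaphore((envs.FD_SUPPORT_MAX_CONNECTIONS + workers - 1) // workers)` line in the hunk above is a ceiling division: it spreads the global connection budget across worker processes so the per-worker semaphores never sum to less than the global limit. A quick illustration:

```python
# Ceiling division used to size the per-worker semaphore.
def per_worker_limit(max_connections: int, workers: int) -> int:
    return (max_connections + workers - 1) // workers

print(per_worker_limit(1024, 4))  # 256
print(per_worker_limit(10, 3))    # 4  (3 workers * 4 = 12 >= 10)
```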
@@ -116,8 +121,6 @@ class EngineClient:
|
||||
task["prompt_token_ids_len"] = len(task["prompt_token_ids"])
|
||||
input_ids_len = task["prompt_token_ids_len"]
|
||||
task["max_tokens"] = min(self.max_model_len - input_ids_len, task.get("max_tokens"))
|
||||
if task.get("reasoning_max_tokens", None) is None:
|
||||
task["reasoning_max_tokens"] = max(int(task["max_tokens"] * 0.8), 1)
|
||||
min_tokens = task.get("min_tokens", 1)
|
||||
if "messages" in task:
|
||||
del task["messages"]
|
||||
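A worked example of the clamping in the hunk above (the numbers are illustrative): `max_tokens` is capped by the remaining context, and when `reasoning_max_tokens` is unset it defaults to 80% of that cap.

```python
# Illustrative values: 8192-token context, 300-token prompt, 9000 requested.
max_model_len, prompt_len, requested = 8192, 300, 9000
max_tokens = min(max_model_len - prompt_len, requested)   # 7892
reasoning_max_tokens = max(int(max_tokens * 0.8), 1)       # 6313
print(max_tokens, reasoning_max_tokens)
```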
@@ -144,6 +147,26 @@ class EngineClient:
|
||||
api_server_logger.error(error_msg)
|
||||
raise EngineError(error_msg, error_code=400)
|
||||
|
||||
if "stop_seqs_len" in task:
|
||||
stop_seqs_len = task["stop_seqs_len"]
|
||||
max_stop_seqs_num = int(envs.FD_MAX_STOP_SEQS_NUM)
|
||||
if len(stop_seqs_len) > max_stop_seqs_num:
|
||||
error_msg = (
|
||||
f"Length of stop ({stop_seqs_len}) exceeds the limit max_stop_seqs_num({max_stop_seqs_num})."
|
||||
"Please reduce the number of stop or set a lager max_stop_seqs_num by `FD_MAX_STOP_SEQS_NUM`"
|
||||
)
|
||||
api_server_logger.error(error_msg)
|
||||
raise EngineError(error_msg, error_code=400)
|
||||
stop_seqs_max_len = int(envs.FD_STOP_SEQS_MAX_LEN)
|
||||
for single_stop_seq_len in stop_seqs_len:
|
||||
if single_stop_seq_len > stop_seqs_max_len:
|
||||
error_msg = (
|
||||
f"Length of stop_seqs({single_stop_seq_len}) exceeds the limit stop_seqs_max_len({stop_seqs_max_len})."
|
||||
"Please reduce the length of stop sequences or set a larger stop_seqs_max_len by `FD_STOP_SEQS_MAX_LEN`"
|
||||
)
|
||||
api_server_logger.error(error_msg)
|
||||
raise EngineError(error_msg, error_code=400)
|
||||
|
||||
task["preprocess_end_time"] = time.time()
|
||||
preprocess_cost_time = task["preprocess_end_time"] - task["preprocess_start_time"]
|
||||
api_server_logger.info(
|
||||
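A hedged sketch of the stop-sequence limits enforced in the hunk above: the number of stop sequences is capped by `FD_MAX_STOP_SEQS_NUM` and each sequence's token length by `FD_STOP_SEQS_MAX_LEN`. The fallback defaults below are placeholders, not FastDeploy's actual values.

```python
# Standalone sketch of the validation; env defaults here are placeholders.
import os

def validate_stop_seqs_len(stop_seqs_len: list[int]) -> None:
    max_num = int(os.getenv("FD_MAX_STOP_SEQS_NUM", "5"))   # placeholder default
    max_len = int(os.getenv("FD_STOP_SEQS_MAX_LEN", "8"))   # placeholder default
    if len(stop_seqs_len) > max_num:
        raise ValueError(f"too many stop sequences: {len(stop_seqs_len)} > {max_num}")
    for length in stop_seqs_len:
        if length > max_len:
            raise ValueError(f"stop sequence too long: {length} > {max_len}")

validate_stop_seqs_len([2, 3])  # passes with the placeholder defaults
```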
@@ -167,35 +190,35 @@ class EngineClient:
|
||||
Validate stream options
|
||||
"""
|
||||
|
||||
if data.get("n"):
|
||||
if data.get("n") is not None:
|
||||
if data["n"] != 1:
|
||||
raise ValueError("n only support 1.")
|
||||
|
||||
if data.get("max_tokens"):
|
||||
if data.get("max_tokens") is not None:
|
||||
if data["max_tokens"] < 1 or data["max_tokens"] >= self.max_model_len:
|
||||
raise ValueError(f"max_tokens can be defined [1, {self.max_model_len}).")
|
||||
|
||||
if data.get("reasoning_max_tokens"):
|
||||
if data.get("reasoning_max_tokens") is not None:
|
||||
if data["reasoning_max_tokens"] > data["max_tokens"] or data["reasoning_max_tokens"] < 1:
|
||||
raise ValueError("reasoning_max_tokens must be between max_tokens and 1")
|
||||
|
||||
if data.get("top_p"):
|
||||
if data.get("top_p") is not None:
|
||||
if data["top_p"] > 1 or data["top_p"] < 0:
|
||||
raise ValueError("top_p value can only be defined [0, 1].")
|
||||
|
||||
if data.get("frequency_penalty"):
|
||||
if data.get("frequency_penalty") is not None:
|
||||
if not -2.0 <= data["frequency_penalty"] <= 2.0:
|
||||
raise ValueError("frequency_penalty must be in [-2, 2]")
|
||||
|
||||
if data.get("temperature"):
|
||||
if data.get("temperature") is not None:
|
||||
if data["temperature"] < 0:
|
||||
raise ValueError("temperature must be non-negative")
|
||||
|
||||
if data.get("presence_penalty"):
|
||||
if data.get("presence_penalty") is not None:
|
||||
if not -2.0 <= data["presence_penalty"] <= 2.0:
|
||||
raise ValueError("presence_penalty must be in [-2, 2]")
|
||||
|
||||
if data.get("seed"):
|
||||
if data.get("seed") is not None:
|
||||
if not 0 <= data["seed"] <= 922337203685477580:
|
||||
raise ValueError("seed must be in [0, 922337203685477580]")
|
||||
|
||||
|
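The hunk above switches every check from truthiness to an explicit `is not None` because legitimate zero-valued parameters are falsy, so `if data.get("temperature"):` would silently skip their validation. A two-line demonstration:

```python
# temperature=0.0 and top_p=0.0 are valid values but falsy.
data = {"temperature": 0.0, "top_p": 0.0}

if data.get("temperature"):               # falsy: validation is skipped
    print("checked (truthy)")
if data.get("temperature") is not None:   # runs: the key is present
    assert data["temperature"] >= 0
    print("checked (is not None)")
```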
@@ -28,8 +28,7 @@ from tqdm import tqdm
|
||||
from fastdeploy.engine.args_utils import EngineArgs
|
||||
from fastdeploy.engine.engine import LLMEngine
|
||||
from fastdeploy.engine.sampling_params import SamplingParams
|
||||
|
||||
# from fastdeploy.entrypoints.chat_utils import ChatCompletionMessageParam
|
||||
from fastdeploy.entrypoints.openai.tool_parsers import ToolParserManager
|
||||
from fastdeploy.utils import llm_logger, retrive_model_from_server
|
||||
from fastdeploy.worker.output import Logprob, LogprobsLists
|
||||
|
||||
@@ -73,6 +72,9 @@ class LLM:
|
||||
**kwargs,
|
||||
):
|
||||
model = retrive_model_from_server(model, revision)
|
||||
tool_parser_plugin = kwargs.get("tool_parser_plugin")
|
||||
if tool_parser_plugin:
|
||||
ToolParserManager.import_tool_parser(tool_parser_plugin)
|
||||
engine_args = EngineArgs(
|
||||
model=model,
|
||||
tokenizer=tokenizer,
|
||||
@@ -238,7 +240,7 @@ class LLM:
|
||||
self,
|
||||
prompts,
|
||||
sampling_params,
|
||||
chat_template_kwargs: Optional[dict[str, Any]] = None,
|
||||
**kwargs,
|
||||
):
|
||||
"""
|
||||
Add a request to the LLM Engine and return the request ID.
|
||||
@@ -279,12 +281,13 @@ class LLM:
|
||||
current_sampling_params = sampling_params[i]
|
||||
else:
|
||||
current_sampling_params = sampling_params
|
||||
enable_thinking = None
|
||||
if chat_template_kwargs is not None:
|
||||
enable_thinking = chat_template_kwargs.get("enable_thinking", None)
|
||||
self.llm_engine.add_requests(tasks, current_sampling_params, enable_thinking=enable_thinking)
|
||||
self.llm_engine.add_requests(tasks, current_sampling_params, **kwargs)
|
||||
return req_ids
|
||||
|
||||
def _decode_token(self, token_id: int) -> str:
|
||||
"""Decodes a single token ID into its string representation."""
|
||||
return self.llm_engine.data_processor.process_logprob_response([token_id], clean_up_tokenization_spaces=False)
|
||||
|
||||
def _build_sample_logprobs(self, logprobs_lists: LogprobsLists, topk_logprobs: int) -> list[dict[int, Logprob]]:
|
||||
"""
|
||||
Constructs a list of dictionaries mapping token IDs to Logprob objects,
|
||||
@@ -318,8 +321,9 @@ class LLM:
|
||||
sliced_logprobs_lists = logprobs_lists.slice_columns(1, 1 + effective_topk_logprobs)
|
||||
result = []
|
||||
for token_ids, logprobs in zip(sliced_logprobs_lists.logprob_token_ids, sliced_logprobs_lists.logprobs):
|
||||
|
||||
logprob_dict = {
|
||||
token_id: Logprob(logprob=logprob, rank=i + 1, decoded_token=None)
|
||||
token_id: Logprob(logprob=logprob, rank=i + 1, decoded_token=self._decode_token(token_id))
|
||||
for i, (token_id, logprob) in enumerate(zip(token_ids, logprobs))
|
||||
}
|
||||
result.append(logprob_dict)
|
||||
|
@@ -14,15 +14,17 @@
|
||||
# limitations under the License.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import os
|
||||
import threading
|
||||
import time
|
||||
from collections.abc import AsyncGenerator
|
||||
from contextlib import asynccontextmanager
|
||||
from multiprocessing import current_process
|
||||
|
||||
import uvicorn
|
||||
import zmq
|
||||
from fastapi import FastAPI, Request
|
||||
from fastapi import FastAPI, HTTPException, Request
|
||||
from fastapi.responses import JSONResponse, Response, StreamingResponse
|
||||
from prometheus_client import CONTENT_TYPE_LATEST
|
||||
|
||||
@@ -39,15 +41,17 @@ from fastdeploy.entrypoints.openai.protocol import (
|
||||
)
|
||||
from fastdeploy.entrypoints.openai.serving_chat import OpenAIServingChat
|
||||
from fastdeploy.entrypoints.openai.serving_completion import OpenAIServingCompletion
|
||||
from fastdeploy.entrypoints.openai.tool_parsers import ToolParserManager
|
||||
from fastdeploy.metrics.metrics import (
|
||||
EXCLUDE_LABELS,
|
||||
cleanup_prometheus_files,
|
||||
get_filtered_metrics,
|
||||
main_process_metrics,
|
||||
)
|
||||
from fastdeploy.metrics.trace_util import inject_to_metadata, instrument
|
||||
from fastdeploy.metrics.trace_util import fd_start_span, inject_to_metadata, instrument
|
||||
from fastdeploy.utils import (
|
||||
FlexibleArgumentParser,
|
||||
StatefulSemaphore,
|
||||
api_server_logger,
|
||||
console_logger,
|
||||
is_port_available,
|
||||
@@ -60,10 +64,18 @@ parser.add_argument("--host", default="0.0.0.0", type=str, help="host to the htt
|
||||
parser.add_argument("--workers", default=1, type=int, help="number of workers")
|
||||
parser.add_argument("--metrics-port", default=8001, type=int, help="port for metrics server")
|
||||
parser.add_argument("--controller-port", default=-1, type=int, help="port for controller server")
|
||||
parser.add_argument(
|
||||
"--max-waiting-time",
|
||||
default=-1,
|
||||
type=int,
|
||||
help="max waiting time for connection, if set value -1 means no waiting time limit",
|
||||
)
|
||||
parser.add_argument("--max-concurrency", default=512, type=int, help="max concurrency")
|
||||
parser = EngineArgs.add_cli_args(parser)
|
||||
args = parser.parse_args()
|
||||
args.model = retrive_model_from_server(args.model, args.revision)
|
||||
|
||||
if args.tool_parser_plugin:
|
||||
ToolParserManager.import_tool_parser(args.tool_parser_plugin)
|
||||
llm_engine = None
|
||||
|
||||
|
||||
@@ -115,10 +127,12 @@ async def lifespan(app: FastAPI):
|
||||
args.reasoning_parser,
|
||||
args.data_parallel_size,
|
||||
args.enable_logprob,
|
||||
args.workers,
|
||||
args.tool_call_parser,
|
||||
)
|
||||
app.state.dynamic_load_weight = args.dynamic_load_weight
|
||||
chat_handler = OpenAIServingChat(engine_client, pid, args.ips)
|
||||
completion_handler = OpenAIServingCompletion(engine_client, pid, args.ips)
|
||||
chat_handler = OpenAIServingChat(engine_client, pid, args.ips, args.max_waiting_time)
|
||||
completion_handler = OpenAIServingCompletion(engine_client, pid, args.ips, args.max_waiting_time)
|
||||
engine_client.create_zmq_client(model=pid, mode=zmq.PUSH)
|
||||
engine_client.pid = pid
|
||||
app.state.engine_client = engine_client
|
||||
@@ -140,6 +154,41 @@ app = FastAPI(lifespan=lifespan)
|
||||
instrument(app)
|
||||
|
||||
|
||||
MAX_CONCURRENT_CONNECTIONS = (args.max_concurrency + args.workers - 1) // args.workers
|
||||
connection_semaphore = StatefulSemaphore(MAX_CONCURRENT_CONNECTIONS)
|
||||
|
||||
|
||||
@asynccontextmanager
|
||||
async def connection_manager():
|
||||
"""
|
||||
async context manager for connection manager
|
||||
"""
|
||||
try:
|
||||
await asyncio.wait_for(connection_semaphore.acquire(), timeout=0.001)
|
||||
yield
|
||||
except asyncio.TimeoutError:
|
||||
api_server_logger.info(f"Reach max request release: {connection_semaphore.status()}")
|
||||
raise HTTPException(
|
||||
status_code=429, detail=f"Too many requests, current max concurrency is {args.max_concurrency}"
|
||||
)
|
||||
|
||||
|
||||
def wrap_streaming_generator(original_generator: AsyncGenerator):
|
||||
"""
|
||||
Wrap an async generator to release the connection semaphore when the generator is finished.
|
||||
"""
|
||||
|
||||
async def wrapped_generator():
|
||||
try:
|
||||
async for chunk in original_generator:
|
||||
yield chunk
|
||||
finally:
|
||||
api_server_logger.debug(f"current concurrency status: {connection_semaphore.status()}")
|
||||
connection_semaphore.release()
|
||||
|
||||
return wrapped_generator
|
||||
|
||||
|
||||
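A hedged sketch of the connection-limiting pattern added above, using a plain `asyncio.Semaphore` in place of FastDeploy's `StatefulSemaphore`: acquire with a tiny timeout (mapping a timeout to HTTP 429), then release when handling ends. The real code defers the release for streaming responses to the generator wrapper; the simplified version below releases in `finally`.

```python
import asyncio
from contextlib import asynccontextmanager

semaphore = asyncio.Semaphore(2)  # illustrative per-worker limit

@asynccontextmanager
async def limited_connection():
    try:
        await asyncio.wait_for(semaphore.acquire(), timeout=0.001)
    except asyncio.TimeoutError:
        raise RuntimeError("too many concurrent requests")  # maps to HTTP 429
    try:
        yield
    finally:
        semaphore.release()

async def handle():
    async with limited_connection():
        await asyncio.sleep(0)  # stand-in for real request handling

asyncio.run(handle())
```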
# TODO: report the real engine status, looked up via pid
|
||||
@app.get("/health")
|
||||
def health(request: Request) -> Response:
|
||||
@@ -198,20 +247,30 @@ async def create_chat_completion(request: ChatCompletionRequest):
|
||||
"""
|
||||
Create a chat completion for the provided prompt and parameters.
|
||||
"""
|
||||
api_server_logger.info(f"Chat Received request: {request.model_dump_json()}")
|
||||
if app.state.dynamic_load_weight:
|
||||
status, msg = app.state.engine_client.is_workers_alive()
|
||||
if not status:
|
||||
return JSONResponse(content={"error": "Worker Service Not Healthy"}, status_code=304)
|
||||
inject_to_metadata(request)
|
||||
generator = await app.state.chat_handler.create_chat_completion(request)
|
||||
try:
|
||||
async with connection_manager():
|
||||
inject_to_metadata(request)
|
||||
generator = await app.state.chat_handler.create_chat_completion(request)
|
||||
if isinstance(generator, ErrorResponse):
|
||||
connection_semaphore.release()
|
||||
api_server_logger.debug(f"current concurrency status: {connection_semaphore.status()}")
|
||||
return JSONResponse(content={"detail": generator.model_dump()}, status_code=generator.code)
|
||||
elif isinstance(generator, ChatCompletionResponse):
|
||||
connection_semaphore.release()
|
||||
api_server_logger.debug(f"current concurrency status: {connection_semaphore.status()}")
|
||||
return JSONResponse(content=generator.model_dump())
|
||||
else:
|
||||
wrapped_generator = wrap_streaming_generator(generator)
|
||||
return StreamingResponse(content=wrapped_generator(), media_type="text/event-stream")
|
||||
|
||||
if isinstance(generator, ErrorResponse):
|
||||
return JSONResponse(content=generator.model_dump(), status_code=generator.code)
|
||||
|
||||
elif isinstance(generator, ChatCompletionResponse):
|
||||
return JSONResponse(content=generator.model_dump())
|
||||
|
||||
return StreamingResponse(content=generator, media_type="text/event-stream")
|
||||
except HTTPException as e:
|
||||
api_server_logger.error(f"Error in chat completion: {str(e)}")
|
||||
return JSONResponse(status_code=e.status_code, content={"detail": e.detail})
|
||||
|
||||
|
||||
@app.post("/v1/completions")
|
||||
@@ -219,18 +278,26 @@ async def create_completion(request: CompletionRequest):
|
||||
"""
|
||||
Create a completion for the provided prompt and parameters.
|
||||
"""
|
||||
api_server_logger.info(f"Completion Received request: {request.model_dump_json()}")
|
||||
if app.state.dynamic_load_weight:
|
||||
status, msg = app.state.engine_client.is_workers_alive()
|
||||
if not status:
|
||||
return JSONResponse(content={"error": "Worker Service Not Healthy"}, status_code=304)
|
||||
|
||||
generator = await app.state.completion_handler.create_completion(request)
|
||||
if isinstance(generator, ErrorResponse):
|
||||
return JSONResponse(content=generator.model_dump(), status_code=generator.code)
|
||||
elif isinstance(generator, CompletionResponse):
|
||||
return JSONResponse(content=generator.model_dump())
|
||||
|
||||
return StreamingResponse(content=generator, media_type="text/event-stream")
|
||||
try:
|
||||
async with connection_manager():
|
||||
generator = await app.state.completion_handler.create_completion(request)
|
||||
if isinstance(generator, ErrorResponse):
|
||||
connection_semaphore.release()
|
||||
return JSONResponse(content=generator.model_dump(), status_code=generator.code)
|
||||
elif isinstance(generator, CompletionResponse):
|
||||
connection_semaphore.release()
|
||||
return JSONResponse(content=generator.model_dump())
|
||||
else:
|
||||
wrapped_generator = wrap_streaming_generator(generator)
|
||||
return StreamingResponse(content=wrapped_generator(), media_type="text/event-stream")
|
||||
except HTTPException as e:
|
||||
return JSONResponse(status_code=e.status_code, content={"detail": e.detail})
|
||||
|
||||
|
||||
@app.get("/update_model_weight")
|
||||
@@ -270,6 +337,7 @@ def launch_api_server() -> None:
|
||||
|
||||
api_server_logger.info(f"launch Fastdeploy api server... port: {args.port}")
|
||||
api_server_logger.info(f"args: {args.__dict__}")
|
||||
fd_start_span("FD_START")
|
||||
|
||||
try:
|
||||
uvicorn.run(
|
||||
|
@@ -72,7 +72,6 @@ class ToolCall(BaseModel):
|
||||
id: str = None
|
||||
type: Literal["function"] = "function"
|
||||
function: FunctionCall
|
||||
index: int
|
||||
|
||||
|
||||
class DeltaFunctionCall(BaseModel):
|
||||
@@ -96,6 +95,18 @@ class DeltaToolCall(BaseModel):
|
||||
function: Optional[DeltaFunctionCall] = None
|
||||
|
||||
|
||||
class ExtractedToolCallInformation(BaseModel):
|
||||
# indicate if tools were called
|
||||
tools_called: bool
|
||||
|
||||
# extracted tool calls
|
||||
tool_calls: Optional[list[ToolCall]] = None
|
||||
|
||||
# content - per OpenAI spec, content AND tool calls can be returned rarely
|
||||
# But some models will do this intentionally
|
||||
content: Optional[str] = None
|
||||
|
||||
|
||||
class FunctionDefinition(BaseModel):
|
||||
"""
|
||||
Function definition.
|
||||
@@ -126,6 +137,10 @@ class ChatMessage(BaseModel):
|
||||
tool_calls: Optional[List[DeltaToolCall | ToolCall]] = None
|
||||
prompt_token_ids: Optional[List[int]] = None
|
||||
completion_token_ids: Optional[List[int]] = None
|
||||
text_after_process: Optional[str] = None
|
||||
raw_prediction: Optional[str] = None
|
||||
prompt_tokens: Optional[str] = None
|
||||
completion_tokens: Optional[str] = None
|
||||
|
||||
|
||||
class ChatCompletionResponseChoice(BaseModel):
|
||||
@@ -183,6 +198,10 @@ class DeltaMessage(BaseModel):
|
||||
completion_token_ids: Optional[List[int]] = None
|
||||
reasoning_content: Optional[str] = None
|
||||
tool_calls: Optional[List[DeltaToolCall | ToolCall]] = None
|
||||
text_after_process: Optional[str] = None
|
||||
raw_prediction: Optional[str] = None
|
||||
prompt_tokens: Optional[str] = None
|
||||
completion_tokens: Optional[str] = None
|
||||
|
||||
|
||||
class ChatCompletionResponseStreamChoice(BaseModel):
|
||||
@@ -219,6 +238,10 @@ class CompletionResponseChoice(BaseModel):
|
||||
text: str
|
||||
prompt_token_ids: Optional[List[int]] = None
|
||||
completion_token_ids: Optional[List[int]] = None
|
||||
text_after_process: Optional[str] = None
|
||||
raw_prediction: Optional[str] = None
|
||||
prompt_tokens: Optional[str] = None
|
||||
completion_tokens: Optional[str] = None
|
||||
arrival_time: Optional[float] = None
|
||||
logprobs: Optional[CompletionLogprobs] = None
|
||||
reasoning_content: Optional[str] = None
|
||||
@@ -261,6 +284,10 @@ class CompletionResponseStreamChoice(BaseModel):
|
||||
logprobs: Optional[CompletionLogprobs] = None
|
||||
prompt_token_ids: Optional[List[int]] = None
|
||||
completion_token_ids: Optional[List[int]] = None
|
||||
text_after_process: Optional[str] = None
|
||||
raw_prediction: Optional[str] = None
|
||||
prompt_tokens: Optional[str] = None
|
||||
completion_tokens: Optional[str] = None
|
||||
reasoning_content: Optional[str] = None
|
||||
finish_reason: Optional[Literal["stop", "length", "tool_calls"]] = None
|
||||
tool_calls: Optional[List[DeltaToolCall | ToolCall]] = None
|
||||
@@ -547,12 +574,13 @@ class ChatCompletionRequest(BaseModel):
|
||||
if "messages" in req_dict:
|
||||
del req_dict["messages"]
|
||||
else:
|
||||
assert len(self.messages) > 0
|
||||
|
||||
# If disable_chat_template is set, then the first message in messages will be used as the prompt.
|
||||
if self.disable_chat_template:
|
||||
req_dict["prompt"] = req_dict["messages"][0]["content"]
|
||||
del req_dict["messages"]
|
||||
# If disable_chat_template is set, then the first message in messages will be used as the prompt.
|
||||
assert (
|
||||
len(req_dict["messages"]) > 0
|
||||
), "messages can not be an empty list, unless prompt_token_ids is passed"
|
||||
if self.disable_chat_template:
|
||||
req_dict["prompt"] = req_dict["messages"][0]["content"]
|
||||
del req_dict["messages"]
|
||||
|
||||
guided_json_object = None
|
||||
if self.response_format is not None:
|
||||
|
@@ -49,10 +49,11 @@ class OpenAIServingChat:
|
||||
OpenAI-style chat completions serving
|
||||
"""
|
||||
|
||||
def __init__(self, engine_client, pid, ips):
|
||||
def __init__(self, engine_client, pid, ips, max_waiting_time):
|
||||
self.engine_client = engine_client
|
||||
self.pid = pid
|
||||
self.master_ip = ips
|
||||
self.max_waiting_time = max_waiting_time
|
||||
self.host_ip = get_host_ip()
|
||||
if self.master_ip is not None:
|
||||
if isinstance(self.master_ip, list):
|
||||
@@ -77,31 +78,46 @@ class OpenAIServingChat:
|
||||
api_server_logger.error(err_msg)
|
||||
return ErrorResponse(message=err_msg, code=400)
|
||||
|
||||
if request.user is not None:
|
||||
request_id = f"chatcmpl-{request.user}-{uuid.uuid4()}"
|
||||
else:
|
||||
request_id = f"chatcmpl-{uuid.uuid4()}"
|
||||
api_server_logger.info(f"create chat completion request: {request_id}")
|
||||
|
||||
try:
|
||||
current_req_dict = request.to_dict_for_infer(request_id)
|
||||
current_req_dict["arrival_time"] = time.time()
|
||||
prompt_token_ids = self.engine_client.format_and_add_data(current_req_dict)
|
||||
if isinstance(prompt_token_ids, np.ndarray):
|
||||
prompt_token_ids = prompt_token_ids.tolist()
|
||||
except Exception as e:
|
||||
return ErrorResponse(code=400, message=str(e))
|
||||
if self.max_waiting_time < 0:
|
||||
await self.engine_client.semaphore.acquire()
|
||||
else:
|
||||
await asyncio.wait_for(self.engine_client.semaphore.acquire(), timeout=self.max_waiting_time)
|
||||
api_server_logger.debug(f"current waiting request {self.engine_client.semaphore.status()}")
|
||||
|
||||
del current_req_dict
|
||||
|
||||
if request.stream:
|
||||
return self.chat_completion_stream_generator(request, request_id, request.model, prompt_token_ids)
|
||||
else:
|
||||
if request.user is not None:
|
||||
request_id = f"chatcmpl-{request.user}-{uuid.uuid4()}"
|
||||
else:
|
||||
request_id = f"chatcmpl-{uuid.uuid4()}"
|
||||
api_server_logger.info(f"create chat completion request: {request_id}")
|
||||
text_after_process = None
|
||||
try:
|
||||
return await self.chat_completion_full_generator(request, request_id, request.model, prompt_token_ids)
|
||||
current_req_dict = request.to_dict_for_infer(request_id)
|
||||
current_req_dict["arrival_time"] = time.time()
|
||||
prompt_token_ids = self.engine_client.format_and_add_data(current_req_dict)
|
||||
text_after_process = current_req_dict.get("text_after_process")
|
||||
if isinstance(prompt_token_ids, np.ndarray):
|
||||
prompt_token_ids = prompt_token_ids.tolist()
|
||||
except Exception as e:
|
||||
self.engine_client.semaphore.release()
|
||||
return ErrorResponse(code=400, message=str(e))
|
||||
|
||||
del current_req_dict
|
||||
|
||||
if request.stream:
|
||||
return self.chat_completion_stream_generator(
|
||||
request, request_id, request.model, prompt_token_ids, text_after_process
|
||||
)
|
||||
else:
|
||||
try:
|
||||
return await self.chat_completion_full_generator(
|
||||
request, request_id, request.model, prompt_token_ids, text_after_process
|
||||
)
|
||||
except Exception as e:
|
||||
return ErrorResponse(code=400, message=str(e))
|
||||
except Exception:
|
||||
return ErrorResponse(code=408, message=f"Request queue time exceeded {self.max_waiting_time}")
|
||||
|
||||
def _create_streaming_error_response(self, message: str) -> str:
|
||||
error_response = ErrorResponse(
|
||||
code=400,
|
||||
@@ -115,6 +131,7 @@ class OpenAIServingChat:
|
||||
request_id: str,
|
||||
model_name: str,
|
||||
prompt_token_ids: list(),
|
||||
text_after_process: str,
|
||||
):
|
||||
"""
|
||||
Streaming chat completion generator.
|
||||
@@ -125,6 +142,7 @@ class OpenAIServingChat:
|
||||
previous_num_tokens = 0
|
||||
num_prompt_tokens = 0
|
||||
num_choices = 1
|
||||
tool_called = False
|
||||
max_streaming_response_tokens = (
|
||||
request.max_streaming_response_tokens
|
||||
if request.max_streaming_response_tokens is not None
|
||||
@@ -207,6 +225,8 @@ class OpenAIServingChat:
|
||||
)
|
||||
if request.return_token_ids:
|
||||
choice.delta.prompt_token_ids = list(prompt_token_ids)
|
||||
choice.delta.text_after_process = text_after_process
|
||||
choice.delta.prompt_tokens = text_after_process
|
||||
chunk = ChatCompletionStreamResponse(
|
||||
id=request_id,
|
||||
object=chunk_object_type,
|
||||
@@ -222,25 +242,35 @@ class OpenAIServingChat:
|
||||
prompt_tokens_details=PromptTokenUsageInfo(cached_tokens=num_cached_tokens),
|
||||
)
|
||||
yield f"data: {chunk.model_dump_json(exclude_unset=True)} \n\n"
|
||||
api_server_logger.info(f"Chat Streaming response send_idx 0: {chunk.model_dump_json()}")
|
||||
first_iteration = False
|
||||
|
||||
output = res["outputs"]
|
||||
delta_text = output["text"]
|
||||
output_top_logprobs = output["top_logprobs"]
|
||||
previous_num_tokens += len(output["token_ids"])
|
||||
logprobs_res: Optional[LogProbs] = None
|
||||
if request.logprobs and output_top_logprobs is not None:
|
||||
logprobs_res = self._create_chat_logprobs(
|
||||
output_top_logprobs, request.logprobs, request.top_logprobs
|
||||
)
|
||||
|
||||
previous_num_tokens += len(output["token_ids"])
|
||||
delta_message = DeltaMessage(
|
||||
content=delta_text,
|
||||
reasoning_content=output.get("reasoning_content"),
|
||||
reasoning_content="",
|
||||
prompt_token_ids=None,
|
||||
completion_token_ids=None,
|
||||
tool_calls=output.get("tool_call_content", []),
|
||||
tool_calls=None,
|
||||
)
|
||||
if not res["finished"] and "delta_message" in output:
|
||||
delta_message_output = output["delta_message"]
|
||||
if delta_message_output is None:
|
||||
continue
|
||||
delta_message.content = delta_message_output.content or ""
|
||||
delta_message.reasoning_content = delta_message_output.reasoning_content or ""
|
||||
if delta_message_output.tool_calls:
|
||||
delta_message.tool_calls = delta_message_output.tool_calls
|
||||
tool_called = True
|
||||
|
||||
choice = ChatCompletionResponseStreamChoice(
|
||||
index=0,
|
||||
@@ -248,6 +278,7 @@ class OpenAIServingChat:
|
||||
logprobs=logprobs_res,
|
||||
arrival_time=arrival_time,
|
||||
)
|
||||
|
||||
if res["finished"]:
|
||||
num_choices -= 1
|
||||
work_process_metrics.e2e_request_latency.observe(
|
||||
@@ -257,10 +288,7 @@ class OpenAIServingChat:
|
||||
max_tokens = request.max_completion_tokens or request.max_tokens
|
||||
if has_no_token_limit or previous_num_tokens != max_tokens:
|
||||
choice.finish_reason = "stop"
|
||||
if (
|
||||
self.engine_client.reasoning_parser == "ernie_x1"
|
||||
and output.get("finish_reason", "") == "tool_calls"
|
||||
):
|
||||
if tool_called:
|
||||
choice.finish_reason = "tool_calls"
|
||||
else:
|
||||
choice.finish_reason = "length"
|
||||
@@ -270,6 +298,8 @@ class OpenAIServingChat:
|
||||
|
||||
if request.return_token_ids:
|
||||
choice.delta.completion_token_ids = list(output["token_ids"])
|
||||
choice.delta.raw_prediction = output.get("raw_prediction")
|
||||
choice.delta.completion_tokens = output.get("raw_prediction")
|
||||
if include_continuous_usage:
|
||||
chunk.usage = UsageInfo(
|
||||
prompt_tokens=num_prompt_tokens,
|
||||
@@ -281,6 +311,9 @@ class OpenAIServingChat:
|
||||
if len(choices) == max_streaming_response_tokens or res["finished"]:
|
||||
chunk.choices = choices
|
||||
yield f"data: {chunk.model_dump_json(exclude_unset=True)}\n\n"
|
||||
# Log the final chunk
|
||||
if res["finished"]:
|
||||
api_server_logger.info(f"Chat Streaming response last send: {chunk.model_dump_json()}")
|
||||
choices = []
|
||||
|
||||
if choices:
|
||||
@@ -310,6 +343,8 @@ class OpenAIServingChat:
|
||||
yield f"data: {error_data}\n\n"
|
||||
finally:
|
||||
dealer.close()
|
||||
self.engine_client.semaphore.release()
|
||||
api_server_logger.info(f"release {self.engine_client.semaphore.status()}")
|
||||
yield "data: [DONE]\n\n"
|
||||
|
||||
async def chat_completion_full_generator(
|
||||
@@ -318,6 +353,7 @@ class OpenAIServingChat:
|
||||
request_id: str,
|
||||
model_name: str,
|
||||
prompt_token_ids: list(),
|
||||
text_after_process: str,
|
||||
):
|
||||
"""
|
||||
Full chat completion generator.
|
||||
@@ -384,6 +420,8 @@ class OpenAIServingChat:
|
||||
break
|
||||
finally:
|
||||
dealer.close()
|
||||
self.engine_client.semaphore.release()
|
||||
api_server_logger.info(f"release {self.engine_client.semaphore.status()}")
|
||||
|
||||
choices = []
|
||||
output = final_res["outputs"]
|
||||
@@ -391,9 +429,13 @@ class OpenAIServingChat:
|
||||
role="assistant",
|
||||
content=output["text"],
|
||||
reasoning_content=output.get("reasoning_content"),
|
||||
tool_calls=output.get("tool_call_content"),
|
||||
tool_calls=output.get("tool_call"),
|
||||
prompt_token_ids=prompt_token_ids if request.return_token_ids else None,
|
||||
completion_token_ids=completion_token_ids if request.return_token_ids else None,
|
||||
text_after_process=text_after_process if request.return_token_ids else None,
|
||||
prompt_tokens=text_after_process if request.return_token_ids else None,
|
||||
raw_prediction=output.get("raw_prediction") if request.return_token_ids else None,
|
||||
completion_tokens=output.get("raw_prediction") if request.return_token_ids else None,
|
||||
)
|
||||
logprobs_full_res = None
|
||||
if logprob_contents:
|
||||
@@ -409,7 +451,7 @@ class OpenAIServingChat:
|
||||
max_tokens = request.max_completion_tokens or request.max_tokens
|
||||
if has_no_token_limit or previous_num_tokens != max_tokens:
|
||||
choice.finish_reason = "stop"
|
||||
if self.engine_client.reasoning_parser == "ernie_x1" and output.get("finish_reason", "") == "tool_calls":
|
||||
if output.get("tool_call"):
|
||||
choice.finish_reason = "tool_calls"
|
||||
else:
|
||||
choice.finish_reason = "length"
|
||||
@@ -427,13 +469,15 @@ class OpenAIServingChat:
|
||||
prompt_tokens_details=PromptTokenUsageInfo(cached_tokens=final_res.get("num_cached_tokens", 0)),
|
||||
)
|
||||
work_process_metrics.e2e_request_latency.observe(time.time() - final_res["metrics"]["request_start_time"])
|
||||
return ChatCompletionResponse(
|
||||
res = ChatCompletionResponse(
|
||||
id=request_id,
|
||||
created=created_time,
|
||||
model=model_name,
|
||||
choices=choices,
|
||||
usage=usage,
|
||||
)
|
||||
api_server_logger.info(f"Chat response: {res.model_dump_json()}")
|
||||
return res
|
||||
|
||||
def _create_chat_logprobs(
|
||||
self,
|
||||
|
@@ -40,11 +40,12 @@ from fastdeploy.worker.output import LogprobsLists
|
||||
|
||||
|
||||
class OpenAIServingCompletion:
|
||||
def __init__(self, engine_client, pid, ips):
|
||||
def __init__(self, engine_client, pid, ips, max_waiting_time):
|
||||
self.engine_client = engine_client
|
||||
self.pid = pid
|
||||
self.master_ip = ips
|
||||
self.host_ip = get_host_ip()
|
||||
self.max_waiting_time = max_waiting_time
|
||||
if self.master_ip is not None:
|
||||
if isinstance(self.master_ip, list):
|
||||
self.master_ip = self.master_ip[0]
|
||||
@@ -99,20 +100,29 @@ class OpenAIServingCompletion:
|
||||
|
||||
api_server_logger.info(f"start inference for request {num_choices}")
|
||||
prompt_batched_token_ids = []
|
||||
text_after_process_list = []
|
||||
try:
|
||||
for idx, prompt in enumerate(request_prompts):
|
||||
request_id_idx = f"{request_id}-{idx}"
|
||||
current_req_dict = request.to_dict_for_infer(request_id_idx, prompt)
|
||||
try:
|
||||
if self.max_waiting_time < 0:
|
||||
await self.engine_client.semaphore.acquire()
|
||||
else:
|
||||
await asyncio.wait_for(self.engine_client.semaphore.acquire(), timeout=self.max_waiting_time)
|
||||
except Exception:
|
||||
return ErrorResponse(code=408, message=f"Request queue time exceeded {self.max_waiting_time}")
|
||||
try:
|
||||
try:
|
||||
for idx, prompt in enumerate(request_prompts):
|
||||
request_id_idx = f"{request_id}-{idx}"
|
||||
current_req_dict = request.to_dict_for_infer(request_id_idx, prompt)
|
||||
current_req_dict["arrival_time"] = time.time()
|
||||
prompt_token_ids = self.engine_client.format_and_add_data(current_req_dict)
|
||||
if isinstance(prompt_token_ids, np.ndarray):
|
||||
prompt_token_ids = prompt_token_ids.tolist()
|
||||
text_after_process_list.append(current_req_dict.get("text_after_process"))
|
||||
prompt_batched_token_ids.append(prompt_token_ids)
|
||||
except Exception as e:
|
||||
return ErrorResponse(message=str(e), code=400)
|
||||
|
||||
del current_req_dict
|
||||
del current_req_dict
|
||||
except Exception as e:
|
||||
self.engine_client.semaphore.release()
|
||||
return ErrorResponse(message=str(e), code=400)
|
||||
|
||||
if request.stream:
|
||||
return self.completion_stream_generator(
|
||||
@@ -122,6 +132,7 @@ class OpenAIServingCompletion:
|
||||
created_time=created_time,
|
||||
model_name=request.model,
|
||||
prompt_batched_token_ids=prompt_batched_token_ids,
|
||||
text_after_process_list=text_after_process_list,
|
||||
)
|
||||
else:
|
||||
try:
|
||||
@@ -132,6 +143,7 @@ class OpenAIServingCompletion:
|
||||
created_time=created_time,
|
||||
model_name=request.model,
|
||||
prompt_batched_token_ids=prompt_batched_token_ids,
|
||||
text_after_process_list=text_after_process_list,
|
||||
)
|
||||
except Exception as e:
|
||||
return ErrorResponse(code=400, message=str(e))
|
||||
@@ -147,6 +159,7 @@ class OpenAIServingCompletion:
|
||||
created_time: int,
|
||||
model_name: str,
|
||||
prompt_batched_token_ids: list(),
|
||||
text_after_process_list: list(),
|
||||
):
|
||||
"""
|
||||
Process the full completion request with multiple choices.
|
||||
@@ -162,8 +175,8 @@ class OpenAIServingCompletion:
|
||||
|
||||
valid_results = [dict()] * num_choices
|
||||
output_tokens = [0] * num_choices
|
||||
aggregated_top_logprobs = [[[], [], []]] * num_choices
|
||||
aggregated_token_ids = [[]] * num_choices
|
||||
aggregated_top_logprobs = [[[], [], []] for _ in range(num_choices)]
|
||||
aggregated_token_ids = [[] for _ in range(num_choices)]
|
||||
completion_batched_token_ids = [[] for _ in range(num_choices)]
|
||||
current_waiting_time = 0
|
||||
while num_choices > 0:
|
||||
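The hunk above replaces `[[]] * num_choices` with list comprehensions because multiplying a list of lists copies references: every choice would share, and mutate, the same inner list. A short demonstration:

```python
aliased = [[]] * 3
aliased[0].append("token")
print(aliased)                 # [['token'], ['token'], ['token']]  -- shared!

independent = [[] for _ in range(3)]
independent[0].append("token")
print(independent)             # [['token'], [], []]
```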
@@ -207,8 +220,7 @@ class OpenAIServingCompletion:
|
||||
valid_results[rid] = data
|
||||
num_choices -= 1
|
||||
break
|
||||
|
||||
return self.request_output_to_completion_response(
|
||||
res = self.request_output_to_completion_response(
|
||||
final_res_batch=valid_results,
|
||||
request=request,
|
||||
request_id=request_id,
|
||||
@@ -216,13 +228,34 @@ class OpenAIServingCompletion:
|
||||
model_name=model_name,
|
||||
prompt_batched_token_ids=prompt_batched_token_ids,
|
||||
completion_batched_token_ids=completion_batched_token_ids,
|
||||
text_after_process_list=text_after_process_list,
|
||||
)
|
||||
api_server_logger.info(f"Completion response: {res.model_dump_json()}")
|
||||
return res
|
||||
except Exception as e:
|
||||
api_server_logger.error(f"Error in completion_full_generator: {e}", exc_info=True)
|
||||
raise
|
||||
finally:
|
||||
if dealer is not None:
|
||||
dealer.close()
|
||||
self.engine_client.semaphore.release()
|
||||
|
||||
async def _echo_back_prompt(self, request, res, idx):
|
||||
if res["outputs"].get("send_idx", -1) == 0 and request.echo:
|
||||
if isinstance(request.prompt, list):
|
||||
prompt_text = request.prompt[idx]
|
||||
else:
|
||||
prompt_text = request.prompt
|
||||
res["outputs"]["text"] = prompt_text + (res["outputs"]["text"] or "")
|
||||
|
||||
def calc_finish_reason(self, max_tokens, token_num, output, tool_called):
|
||||
if max_tokens is None or token_num != max_tokens:
|
||||
if tool_called or output.get("tool_call"):
|
||||
return "tool_calls"
|
||||
else:
|
||||
return "stop"
|
||||
else:
|
||||
return "length"
|
||||
|
||||
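A hedged restatement of `calc_finish_reason` above: "length" only when the generated token count hit `max_tokens`; otherwise "tool_calls" if a tool call was emitted, else "stop". The real method also inspects the output dict for a `tool_call` field; this sketch folds that into a single flag.

```python
def calc_finish_reason(max_tokens, token_num, tool_called):
    if max_tokens is None or token_num != max_tokens:
        return "tool_calls" if tool_called else "stop"
    return "length"

print(calc_finish_reason(None, 42, False))   # stop
print(calc_finish_reason(128, 128, False))   # length
print(calc_finish_reason(128, 64, True))     # tool_calls
```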
async def completion_stream_generator(
|
||||
self,
|
||||
@@ -232,6 +265,7 @@ class OpenAIServingCompletion:
|
||||
created_time: int,
|
||||
model_name: str,
|
||||
prompt_batched_token_ids: list(),
|
||||
text_after_process_list: list(),
|
||||
):
|
||||
"""
|
||||
Process the stream completion request.
|
||||
@@ -245,6 +279,7 @@ class OpenAIServingCompletion:
|
||||
output_tokens = [0] * num_choices
|
||||
inference_start_time = [0] * num_choices
|
||||
first_iteration = [True] * num_choices
|
||||
tool_called = [False] * num_choices
|
||||
max_streaming_response_tokens = (
|
||||
request.max_streaming_response_tokens
|
||||
if request.max_streaming_response_tokens is not None
|
||||
@@ -290,11 +325,16 @@ class OpenAIServingCompletion:
|
||||
index=idx,
|
||||
text="",
|
||||
prompt_token_ids=list(prompt_batched_token_ids[idx]),
|
||||
text_after_process=text_after_process_list[idx],
|
||||
prompt_tokens=text_after_process_list[idx],
|
||||
completion_token_ids=None,
|
||||
)
|
||||
],
|
||||
)
|
||||
yield f"data: {chunk.model_dump_json(exclude_unset=True)}\n\n"
|
||||
api_server_logger.info(
|
||||
f"Completion Streaming response send_idx 0: {chunk.model_dump_json()}"
|
||||
)
|
||||
first_iteration[idx] = False
|
||||
|
||||
self.engine_client.data_processor.process_response_dict(
|
||||
@@ -306,36 +346,51 @@ class OpenAIServingCompletion:
|
||||
else:
|
||||
arrival_time = res["metrics"]["arrival_time"] - inference_start_time[idx]
|
||||
|
||||
await self._echo_back_prompt(request, res, idx)
|
||||
output = res["outputs"]
|
||||
output_top_logprobs = output["top_logprobs"]
|
||||
logprobs_res: Optional[CompletionLogprobs] = None
|
||||
if request.logprobs and output_top_logprobs is not None:
|
||||
logprobs_res = self._create_completion_logprobs(output_top_logprobs, request.logprobs, 0)
|
||||
|
||||
choices.append(
|
||||
CompletionResponseStreamChoice(
|
||||
index=idx,
|
||||
text=output["text"],
|
||||
prompt_token_ids=None,
|
||||
completion_token_ids=output.get("token_ids") if request.return_token_ids else None,
|
||||
tool_calls=output.get("tool_call_content"),
|
||||
reasoning_content=output.get("reasoning_content"),
|
||||
arrival_time=arrival_time,
|
||||
logprobs=logprobs_res,
|
||||
)
|
||||
)
|
||||
if res["finished"]:
|
||||
if request.max_tokens is None or output_tokens[idx] + 1 != request.max_tokens:
|
||||
chunk.choices[0].finish_reason = "stop"
|
||||
if (
|
||||
self.engine_client.reasoning_parser == "ernie_x1"
|
||||
and output.get("finish_reason", "") == "tool_calls"
|
||||
):
|
||||
chunk.choices[0].finish_reason = "tool_calls"
|
||||
else:
|
||||
chunk.choices[0].finish_reason = "length"
|
||||
|
||||
output_tokens[idx] += 1
|
||||
delta_message = CompletionResponseStreamChoice(
|
||||
index=idx,
|
||||
text=output["text"],
|
||||
prompt_token_ids=None,
|
||||
completion_token_ids=output.get("token_ids") if request.return_token_ids else None,
|
||||
tool_calls=None,
|
||||
raw_prediction=output.get("raw_prediction") if request.return_token_ids else None,
|
||||
completion_tokens=output.get("raw_prediction") if request.return_token_ids else None,
|
||||
reasoning_content="",
|
||||
arrival_time=arrival_time,
|
||||
logprobs=logprobs_res,
|
||||
)
|
||||
if not res["finished"] and "delta_message" in output:
|
||||
delta_message_output = output["delta_message"]
|
||||
if delta_message_output is None:
|
||||
continue
|
||||
delta_message.text = delta_message_output.content or ""
|
||||
delta_message.reasoning_content = delta_message_output.reasoning_content or ""
|
||||
if delta_message_output.tool_calls:
|
||||
delta_message.tool_calls = delta_message_output.tool_calls
|
||||
tool_called[idx] = True
|
||||
|
||||
choices.append(delta_message)
|
||||
|
||||
if res["finished"]:
|
||||
choices[-1].finish_reason = self.calc_finish_reason(
|
||||
request.max_tokens, output_tokens[idx], output, tool_called[idx]
|
||||
)
|
||||
send_idx = output.get("send_idx")
|
||||
# Only log when send_idx is explicitly 0
|
||||
if send_idx == 0 and not request.return_token_ids:
|
||||
chunk_temp = chunk
|
||||
chunk_temp.choices = choices
|
||||
api_server_logger.info(
|
||||
f"Completion Streaming response send_idx 0: {chunk_temp.model_dump_json()}"
|
||||
)
|
||||
del chunk_temp
|
||||
|
||||
if len(choices) == max_streaming_response_tokens or res["finished"]:
|
||||
chunk = CompletionStreamResponse(
|
||||
@@ -358,9 +413,11 @@ class OpenAIServingCompletion:
|
||||
usage=UsageInfo(
|
||||
prompt_tokens=len(prompt_batched_token_ids[idx]),
|
||||
completion_tokens=output_tokens[idx],
|
||||
total_tokens=len(prompt_batched_token_ids[idx]) + output_tokens[idx],
|
||||
),
|
||||
)
|
||||
yield f"data: {usage_chunk.model_dump_json(exclude_unset=True)}\n\n"
|
||||
api_server_logger.info(f"Completion Streaming response last send: {chunk.model_dump_json()}")
|
||||
if choices:
|
||||
chunk.choices = choices
|
||||
yield f"data: {chunk.model_dump_json(exclude_unset=True)}\n\n"
|
||||
@@ -372,6 +429,7 @@ class OpenAIServingCompletion:
|
||||
del request
|
||||
if dealer is not None:
|
||||
dealer.close()
|
||||
self.engine_client.semaphore.release()
|
||||
yield "data: [DONE]\n\n"
|
||||
|
||||
def request_output_to_completion_response(
|
||||
@@ -383,43 +441,37 @@ class OpenAIServingCompletion:
|
||||
model_name: str,
|
||||
prompt_batched_token_ids: list(),
|
||||
completion_batched_token_ids: list(),
|
||||
text_after_process_list: list(),
|
||||
) -> CompletionResponse:
|
||||
choices: List[CompletionResponseChoice] = []
|
||||
num_prompt_tokens = 0
|
||||
num_generated_tokens = 0
|
||||
aggregated_logprobs: Optional[CompletionLogprobs] = None
|
||||
|
||||
for idx in range(len(final_res_batch)):
|
||||
final_res = final_res_batch[idx]
|
||||
prompt_token_ids = prompt_batched_token_ids[idx]
|
||||
assert prompt_token_ids is not None
|
||||
prompt_text = final_res["prompt"]
|
||||
prompt_text = request.prompt
|
||||
completion_token_ids = completion_batched_token_ids[idx]
|
||||
|
||||
output = final_res["outputs"]
|
||||
output_top_logprobs = output["top_logprobs"]
|
||||
|
||||
aggregated_logprobs: Optional[CompletionLogprobs] = None
|
||||
if output_top_logprobs is not None:
|
||||
logprobs_res = self._create_completion_logprobs(output_top_logprobs, request.logprobs, 0)
|
||||
if aggregated_logprobs is None:
|
||||
aggregated_logprobs = logprobs_res
|
||||
else:
|
||||
aggregated_logprobs.tokens.extend(logprobs_res.tokens)
|
||||
aggregated_logprobs.token_logprobs.extend(logprobs_res.token_logprobs)
|
||||
aggregated_logprobs.top_logprobs.extend(logprobs_res.top_logprobs)
|
||||
aggregated_logprobs.text_offset.extend(logprobs_res.text_offset)
|
||||
aggregated_logprobs = self._create_completion_logprobs(output_top_logprobs, request.logprobs, 0)
|
||||
|
||||
if request.echo:
|
||||
assert prompt_text is not None
|
||||
if request.max_tokens == 0:
|
||||
token_ids = prompt_token_ids
|
||||
output_text = prompt_text
|
||||
token_ids = [*prompt_token_ids, *output["token_ids"]]
|
||||
if isinstance(prompt_text, list):
|
||||
output_text = prompt_text[idx] + output["text"]
|
||||
else:
|
||||
token_ids = [*prompt_token_ids, *output["token_ids"]]
|
||||
output_text = prompt_text + output["text"]
|
||||
output_text = str(prompt_text) + output["text"]
|
||||
else:
|
||||
token_ids = output["token_ids"]
|
||||
output_text = output["text"]
|
||||
finish_reason = self.calc_finish_reason(request.max_tokens, final_res["output_token_ids"], output, False)
|
||||
|
||||
choice_data = CompletionResponseChoice(
|
||||
token_ids=token_ids,
|
||||
@@ -427,10 +479,14 @@ class OpenAIServingCompletion:
|
||||
text=output_text,
|
||||
prompt_token_ids=prompt_token_ids if request.return_token_ids else None,
|
||||
completion_token_ids=completion_token_ids if request.return_token_ids else None,
|
||||
raw_prediction=output.get("raw_prediction") if request.return_token_ids else None,
|
||||
completion_tokens=output.get("raw_prediction") if request.return_token_ids else None,
|
||||
text_after_process=text_after_process_list[idx] if request.return_token_ids else None,
|
||||
prompt_tokens=text_after_process_list[idx] if request.return_token_ids else None,
|
||||
reasoning_content=output.get("reasoning_content"),
|
||||
tool_calls=output.get("tool_call_content"),
|
||||
tool_calls=output.get("tool_call"),
|
||||
logprobs=aggregated_logprobs,
|
||||
finish_reason=None,
|
||||
finish_reason=finish_reason,
|
||||
)
|
||||
choices.append(choice_data)
|
||||
|
||||
|
24
fastdeploy/entrypoints/openai/tool_parsers/__init__.py
Normal file
@@ -0,0 +1,24 @@
|
||||
"""
|
||||
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License"
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""
|
||||
|
||||
from .abstract_tool_parser import ToolParser, ToolParserManager
|
||||
from .ernie_x1_tool_parser import ErnieX1ToolParser
|
||||
|
||||
__all__ = [
|
||||
"ToolParser",
|
||||
"ToolParserManager",
|
||||
"ErnieX1ToolParser",
|
||||
]
|
@@ -0,0 +1,159 @@
|
||||
"""
|
||||
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License"
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""
|
||||
|
||||
import os
|
||||
from collections.abc import Sequence
|
||||
from functools import cached_property
|
||||
from typing import Callable, Optional, Union
|
||||
|
||||
from fastdeploy.entrypoints.openai.protocol import (
|
||||
ChatCompletionRequest,
|
||||
DeltaMessage,
|
||||
ExtractedToolCallInformation,
|
||||
)
|
||||
from fastdeploy.utils import data_processor_logger, import_from_path, is_list_of
|
||||
|
||||
|
||||
class ToolParser:
|
||||
"""
|
||||
Abstract ToolParser class that should not be used directly. Provided
|
||||
properties and methods should be used in
|
||||
derived classes.
|
||||
"""
|
||||
|
||||
def __init__(self, tokenizer):
|
||||
self.prev_tool_call_arr: list[dict] = []
|
||||
# the index of the tool call that is currently being parsed
|
||||
self.current_tool_id: int = -1
|
||||
self.current_tool_name_sent: bool = False
|
||||
self.streamed_args_for_tool: list[str] = []
|
||||
|
||||
self.model_tokenizer = tokenizer
|
||||
|
||||
@cached_property
|
||||
def vocab(self) -> dict[str, int]:
|
||||
# NOTE: Only PreTrainedTokenizerFast is guaranteed to have .vocab
|
||||
# whereas all tokenizers have .get_vocab()
|
||||
return self.model_tokenizer.get_vocab()
|
||||
|
||||
def adjust_request(self, request: ChatCompletionRequest) -> ChatCompletionRequest:
|
||||
"""
|
||||
Static method used to adjust the request parameters.
|
||||
"""
|
||||
return request
|
||||
|
||||
def extract_tool_calls(self, model_output: str, request: ChatCompletionRequest) -> ExtractedToolCallInformation:
|
||||
"""
|
||||
Static method that should be implemented for extracting tool calls from
|
||||
a complete model-generated string.
|
||||
Used for non-streaming responses where we have the entire model response
|
||||
available before sending to the client.
|
||||
Static because it's stateless.
|
||||
"""
|
||||
raise NotImplementedError("AbstractToolParser.extract_tool_calls has not been implemented!")
|
||||
|
||||
def extract_tool_calls_streaming(
|
||||
self,
|
||||
previous_text: str,
|
||||
current_text: str,
|
||||
delta_text: str,
|
||||
previous_token_ids: Sequence[int],
|
||||
current_token_ids: Sequence[int],
|
||||
delta_token_ids: Sequence[int],
|
||||
request: ChatCompletionRequest,
|
||||
) -> Union[DeltaMessage, None]:
|
||||
"""
|
||||
Instance method that should be implemented for extracting tool calls
|
||||
from an incomplete response; for use when handling tool calls and
|
||||
streaming. Has to be an instance method because it requires state -
|
||||
the current tokens/diffs, but also the information about what has
|
||||
previously been parsed and extracted (see constructor)
|
||||
"""
|
||||
raise NotImplementedError("AbstractToolParser.extract_tool_calls_streaming has not been " "implemented!")
|
||||
|
||||
|
||||
class ToolParserManager:
|
||||
tool_parsers: dict[str, type] = {}
|
||||
|
||||
@classmethod
|
||||
def get_tool_parser(cls, name) -> type:
|
||||
"""
|
||||
Get tool parser by name which is registered by `register_module`.
|
||||
|
||||
Raise a KeyError exception if the name is not registered.
|
||||
"""
|
||||
if name in cls.tool_parsers:
|
||||
return cls.tool_parsers[name]
|
||||
|
||||
raise KeyError(f"tool helper: '{name}' not found in tool_parsers")
|
||||
|
||||
@classmethod
|
||||
def _register_module(
|
||||
cls, module: type, module_name: Optional[Union[str, list[str]]] = None, force: bool = True
|
||||
) -> None:
|
||||
if not issubclass(module, ToolParser):
|
||||
raise TypeError(f"module must be subclass of ToolParser, but got {type(module)}")
|
||||
if module_name is None:
|
||||
module_name = module.__name__
|
||||
if isinstance(module_name, str):
|
||||
module_name = [module_name]
|
||||
for name in module_name:
|
||||
if not force and name in cls.tool_parsers:
|
||||
existed_module = cls.tool_parsers[name]
|
||||
raise KeyError(f"{name} is already registered " f"at {existed_module.__module__}")
|
||||
cls.tool_parsers[name] = module
|
||||
|
||||
@classmethod
|
||||
def register_module(
|
||||
cls, name: Optional[Union[str, list[str]]] = None, force: bool = True, module: Union[type, None] = None
|
||||
) -> Union[type, Callable]:
|
||||
"""
|
||||
Register module with the given name or name list. It can be used as a
decorator (with module as None) or as a normal function (with module not
None).
|
||||
"""
|
||||
if not isinstance(force, bool):
|
||||
raise TypeError(f"force must be a boolean, but got {type(force)}")
|
||||
|
||||
# raise the error ahead of time
|
||||
if not (name is None or isinstance(name, str) or is_list_of(name, str)):
|
||||
raise TypeError("name must be None, an instance of str, or a sequence of str, " f"but got {type(name)}")
|
||||
|
||||
# use it as a normal method: x.register_module(module=SomeClass)
|
||||
if module is not None:
|
||||
cls._register_module(module=module, module_name=name, force=force)
|
||||
return module
|
||||
|
||||
# use it as a decorator: @x.register_module()
|
||||
def _register(module):
|
||||
cls._register_module(module=module, module_name=name, force=force)
|
||||
return module
|
||||
|
||||
return _register
|
||||
|
||||
@classmethod
|
||||
def import_tool_parser(cls, plugin_path: str) -> None:
|
||||
"""
|
||||
Import a user-defined tool parser by the path of the tool parser definition
|
||||
file.
|
||||
"""
|
||||
module_name = os.path.splitext(os.path.basename(plugin_path))[0]
|
||||
|
||||
try:
|
||||
import_from_path(module_name, plugin_path)
|
||||
except Exception:
|
||||
data_processor_logger.exception("Failed to load module '%s' from %s.", module_name, plugin_path)
|
||||
return
|
@@ -0,0 +1,347 @@
|
||||
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License"
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import json
|
||||
import re
|
||||
import uuid
|
||||
from collections.abc import Sequence
|
||||
from typing import Union
|
||||
|
||||
import partial_json_parser
|
||||
|
||||
|
||||
def random_tool_call_id() -> str:
|
||||
"""Generate a random tool call ID"""
|
||||
return f"chatcmpl-tool-{str(uuid.uuid4().hex)}"
|
||||
|
||||
|
||||
from fastdeploy.entrypoints.openai.protocol import (
|
||||
ChatCompletionRequest,
|
||||
DeltaFunctionCall,
|
||||
DeltaMessage,
|
||||
DeltaToolCall,
|
||||
ExtractedToolCallInformation,
|
||||
FunctionCall,
|
||||
ToolCall,
|
||||
)
|
||||
from fastdeploy.entrypoints.openai.tool_parsers.abstract_tool_parser import (
|
||||
ToolParser,
|
||||
ToolParserManager,
|
||||
)
|
||||
from fastdeploy.utils import data_processor_logger
|
||||
|
||||
|
||||
@ToolParserManager.register_module("ernie_x1")
|
||||
class ErnieX1ToolParser(ToolParser):
|
||||
"""
|
||||
Tool parser for Ernie model version 4.5.1.
|
||||
This parser handles tool calls with newline formats.
|
||||
"""
|
||||
|
||||
def __init__(self, tokenizer):
|
||||
super().__init__(tokenizer)
|
||||
|
||||
self.prev_tool_call_arr: list[dict] = []
|
||||
self.current_tool_id: int = -1
|
||||
self.current_tool_name_sent: bool = False
|
||||
self.streamed_args_for_tool: list[str] = [] # map what has been streamed for each tool so far to a list
|
||||
self.buffer: str = "" # buffer for accumulating unprocessed streaming content
|
||||
self.bracket_counts: dict = {"total_l": 0, "total_r": 0} # track bracket counts in streamed deltas
|
||||
self.tool_call_start_token: str = "<tool_call>"
|
||||
self.tool_call_end_token: str = "</tool_call>"
|
||||
|
||||
self.tool_call_start_token_id = self.vocab.get(self.tool_call_start_token)
|
||||
self.tool_call_end_token_id = self.vocab.get(self.tool_call_end_token)
|
||||
if self.tool_call_start_token_id is None or self.tool_call_end_token_id is None:
|
||||
raise RuntimeError(
|
||||
"Hermes 2 Pro Tool parser could not locate tool call start/end " "tokens in the tokenizer!"
|
||||
)
|
||||
|
||||
if not self.model_tokenizer:
|
||||
raise ValueError(
|
||||
"The model tokenizer must be passed to the ToolCallParser constructor during construction."
|
||||
)
|
||||
|
||||
def extract_tool_calls(self, model_output: str, request: ChatCompletionRequest) -> ExtractedToolCallInformation:
|
||||
"""
|
||||
Extract the tool calls from a complete model response.
|
||||
Supports XML-style formats with newlines:
|
||||
- XML format: <think>\n...\n</think>\n\n\n<tool_call>\n{...}\n</tool_call>\n...
|
||||
|
||||
Handles boundary cases:
|
||||
1. Only name and partial arguments: {"name": "get_weather", "arguments": {"location": "北京"
|
||||
2. Only partial name: {"name": "get_we
|
||||
3. Only name and arguments field without content: {"name": "get_weather", "argume
|
||||
"""
|
||||
|
||||
try:
|
||||
tool_calls = []
|
||||
|
||||
# Check for invalid <response> tags before tool calls
|
||||
if re.search(r"<response>[\s\S]*?</response>\s*(?=<tool_call>)", model_output):
|
||||
data_processor_logger.error("Invalid format: <response> tags found before <tool_call>")
|
||||
return ExtractedToolCallInformation(tools_called=False, content=model_output)
|
||||
|
||||
function_call_arr = []
|
||||
remaining_text = model_output
|
||||
|
||||
while True:
|
||||
# Find the next tool_call block
|
||||
tool_call_pos = remaining_text.find("<tool_call>")
|
||||
if tool_call_pos == -1:
|
||||
break
|
||||
|
||||
# Extract the content after the tool_call start tag
|
||||
tool_content_start = tool_call_pos + len("<tool_call>")
|
||||
tool_content_end = remaining_text.find("</tool_call>", tool_content_start)
|
||||
|
||||
tool_json = ""
|
||||
if tool_content_end == -1:
|
||||
# Handle an unclosed tool_call block (truncated case)
|
||||
tool_json = remaining_text[tool_content_start:].strip()
|
||||
remaining_text = "" # 没有更多内容需要处理
|
||||
else:
|
||||
# Handle a complete tool_call block
|
||||
tool_json = remaining_text[tool_content_start:tool_content_end].strip()
|
||||
remaining_text = remaining_text[tool_content_end + len("</tool_call>") :]
|
||||
|
||||
if not tool_json:
|
||||
continue
|
||||
|
||||
# Process the JSON content
|
||||
tool_json = tool_json.strip()
|
||||
if not tool_json.startswith("{"):
|
||||
tool_json = "{" + tool_json
|
||||
if not tool_json.endswith("}"):
|
||||
tool_json = tool_json + "}"
|
||||
|
||||
try:
|
||||
# Try standard JSON parsing first
|
||||
try:
|
||||
tool_data = json.loads(tool_json)
|
||||
|
||||
if isinstance(tool_data, dict) and "name" in tool_data and "arguments" in tool_data:
|
||||
function_call_arr.append(
|
||||
{
|
||||
"name": tool_data["name"],
|
||||
"arguments": tool_data["arguments"],
|
||||
"_is_complete": True, # 明确标记为完整解析
|
||||
}
|
||||
)
|
||||
continue
|
||||
except json.JSONDecodeError:
|
||||
pass
|
||||
|
||||
# Fall back to partial_json_parser when standard parsing fails
|
||||
from partial_json_parser.core.options import Allow
|
||||
|
||||
try:
|
||||
tool_data = {}
|
||||
flags = Allow.ALL & ~Allow.STR
|
||||
|
||||
# Parse the "name" field
|
||||
name_match = re.search(r'"name"\s*:\s*"([^"]*)"', tool_json)
|
||||
if name_match:
|
||||
tool_data["name"] = name_match.group(1)
|
||||
|
||||
# Parse the "arguments" field
|
||||
args_match = re.search(r'"arguments"\s*:\s*(\{.*)', tool_json)
|
||||
if args_match:
|
||||
try:
|
||||
tool_data["arguments"] = partial_json_parser.loads(args_match.group(1), flags=flags)
|
||||
except:
|
||||
tool_data["arguments"] = None
|
||||
|
||||
if isinstance(tool_data, dict):
|
||||
function_call_arr.append(
|
||||
{
|
||||
"name": tool_data.get("name", ""),
|
||||
"arguments": tool_data.get("arguments", {}),
|
||||
"_is_partial": True, # 标记为部分解析
|
||||
}
|
||||
)
|
||||
except Exception as e:
|
||||
data_processor_logger.debug(f"Failed to parse tool call: {str(e)}")
|
||||
continue
|
||||
except Exception as e:
|
||||
data_processor_logger.debug(f"Failed to parse tool call: {str(e)}")
|
||||
continue
|
||||
|
||||
if not function_call_arr:
|
||||
data_processor_logger.error("No valid tool calls found")
|
||||
return ExtractedToolCallInformation(tools_called=False, content=model_output)
|
||||
|
||||
tool_calls = []
|
||||
all_complete = True  # start as True; any incomplete call flips it to False
|
||||
|
||||
for tool_call in function_call_arr:
|
||||
# Record the parsing status of this tool call
|
||||
is_complete = tool_call.get("_is_complete", False)
|
||||
is_partial = tool_call.get("_is_partial", False)
|
||||
|
||||
# Any incomplete call marks the whole result as incomplete
|
||||
if not is_complete or is_partial:
|
||||
all_complete = False
|
||||
|
||||
# Serialize the arguments
|
||||
tool_args = tool_call.get("arguments", {})
|
||||
if not isinstance(tool_args, dict):
|
||||
tool_args = {}
|
||||
|
||||
try:
|
||||
args_str = json.dumps(tool_args, ensure_ascii=False) if tool_args else "{}"
|
||||
except:
|
||||
args_str = "{}"
|
||||
|
||||
tool_calls.append(
|
||||
ToolCall(
|
||||
type="function",
|
||||
id=random_tool_call_id(),
|
||||
function=FunctionCall(
|
||||
name=tool_call.get("name", ""),
|
||||
arguments=args_str,
|
||||
),
|
||||
)
|
||||
)
|
||||
|
||||
# Only return tools_called=True when every tool call is explicitly marked complete
|
||||
return ExtractedToolCallInformation(
|
||||
tools_called=all_complete, tool_calls=tool_calls if tool_calls else None, content=""
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
data_processor_logger.error(f"Error in extracting tool call from response: {str(e)}")
|
||||
return ExtractedToolCallInformation(tools_called=False, tool_calls=None, content=model_output)
|
||||
|
||||
def extract_tool_calls_streaming(
|
||||
self,
|
||||
previous_text: str,
|
||||
current_text: str,
|
||||
delta_text: str,
|
||||
previous_token_ids: Sequence[int],
|
||||
current_token_ids: Sequence[int],
|
||||
delta_token_ids: Sequence[int],
|
||||
request: dict,
|
||||
) -> Union[DeltaMessage, None]:
|
||||
|
||||
if self.tool_call_start_token_id not in current_token_ids:
|
||||
return DeltaMessage(content=delta_text)
|
||||
# Ignore empty chunks
|
||||
if len(delta_text.strip()) == 0:
|
||||
return None
|
||||
|
||||
try:
|
||||
delta = None
|
||||
# Accumulate delta_text content in the buffer
|
||||
self.buffer += delta_text
|
||||
|
||||
# Handle a new tool_call start appearing in this delta
|
||||
if "<tool_call>" in delta_text:
|
||||
self.current_tool_id = (
|
||||
max(self.current_tool_id, 0) if self.current_tool_id == -1 else self.current_tool_id + 1
|
||||
)
|
||||
self.current_tool_name_sent = False
|
||||
if len(self.streamed_args_for_tool) <= self.current_tool_id:
|
||||
self.streamed_args_for_tool.append("")
|
||||
data_processor_logger.debug(f"New tool call started with ID: {self.current_tool_id}")
|
||||
|
||||
# 1. Try to parse the "name" field
|
||||
if not self.current_tool_name_sent and '"name"' in self.buffer:
|
||||
name_match = re.search(r'"name"\s*:\s*"([^"]*)"', self.buffer)
|
||||
if name_match:
|
||||
name = name_match.group(1)
|
||||
if name:
|
||||
delta = DeltaMessage(
|
||||
tool_calls=[
|
||||
DeltaToolCall(
|
||||
index=self.current_tool_id,
|
||||
type="function",
|
||||
id=random_tool_call_id(),
|
||||
function=DeltaFunctionCall(name=name).model_dump(exclude_none=True),
|
||||
)
|
||||
]
|
||||
)
|
||||
# Remove the already-processed name portion
|
||||
self.buffer = self.buffer[name_match.end() :]
|
||||
self.current_tool_name_sent = True
|
||||
return delta
|
||||
# 2. Try to parse the "arguments" field
|
||||
if '"arguments"' in self.buffer:
|
||||
args_match = re.search(r'"arguments"\s*:\s*(\{.*)', self.buffer)
|
||||
if args_match:
|
||||
args_content = args_match.group(1)
|
||||
try:
|
||||
# Check whether the end of arguments has been reached (braces fully balanced)
|
||||
if "}}" in args_content:
|
||||
# Check brace balance character by character
|
||||
matched_pos = -1
|
||||
for i, ch in enumerate(delta_text):
|
||||
if ch == "{":
|
||||
self.bracket_counts["total_l"] += 1
|
||||
elif ch == "}":
|
||||
self.bracket_counts["total_r"] += 1
|
||||
|
||||
if self.bracket_counts["total_l"] == self.bracket_counts["total_r"]: # 括号完全匹配
|
||||
matched_pos = i
|
||||
break
|
||||
|
||||
if matched_pos >= 0:
|
||||
# Matching point found: trim the buffer and return
|
||||
truncate_text = delta_text[: matched_pos + 1]
|
||||
delta = DeltaMessage(
|
||||
tool_calls=[
|
||||
DeltaToolCall(
|
||||
index=self.current_tool_id,
|
||||
function=DeltaFunctionCall(arguments=truncate_text).model_dump(
|
||||
exclude_none=True
|
||||
),
|
||||
)
|
||||
]
|
||||
)
|
||||
self.buffer = self.buffer[args_match.end() :]
|
||||
return delta
|
||||
else:
|
||||
# Not fully balanced yet; keep accumulating
|
||||
return None
|
||||
else:
|
||||
# Stream back the currently parsable portion incrementally
|
||||
for ch in delta_text:
|
||||
if ch == "{":
|
||||
self.bracket_counts["total_l"] += 1
|
||||
elif ch == "}":
|
||||
self.bracket_counts["total_r"] += 1
|
||||
delta = DeltaMessage(
|
||||
tool_calls=[
|
||||
DeltaToolCall(
|
||||
index=self.current_tool_id,
|
||||
function=DeltaFunctionCall(arguments=delta_text).model_dump(exclude_none=True),
|
||||
)
|
||||
]
|
||||
)
|
||||
return delta
|
||||
except Exception as e:
|
||||
data_processor_logger.error(f"Error in streaming tool call extraction: {str(e)}")
|
||||
return None
|
||||
if "</tool_call>" in self.buffer:
|
||||
end_pos = self.buffer.find("</tool_call>")
|
||||
self.buffer = self.buffer[end_pos + len("</tool_call>") :]
|
||||
|
||||
# Finish handling the current tool call
|
||||
self.streamed_args_for_tool.append("")
|
||||
|
||||
return delta
|
||||
|
||||
except Exception as e:
|
||||
data_processor_logger.error(f"Error in streaming tool call extraction: {str(e)}")
|
||||
return None
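As a usage illustration only (not part of this diff), the non-streaming path of the parser can be exercised roughly as follows; `tokenizer` is a placeholder for an ERNIE tokenizer whose vocabulary contains the <tool_call>/</tool_call> tokens (otherwise the constructor raises):

from fastdeploy.entrypoints.openai.tool_parsers import ToolParserManager

parser_cls = ToolParserManager.get_tool_parser("ernie_x1")
parser = parser_cls(tokenizer)  # tokenizer: placeholder, loaded elsewhere

model_output = (
    "<think>\nreasoning...\n</think>\n\n"
    '<tool_call>\n{"name": "get_weather", "arguments": {"location": "北京"}}\n</tool_call>'
)
info = parser.extract_tool_calls(model_output, request=None)  # request is unused by this method
if info.tools_called:
    for call in info.tool_calls:
        print(call.function.name, call.function.arguments)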
|
137
fastdeploy/entrypoints/openai/tool_parsers/utils.py
Normal file
@@ -0,0 +1,137 @@
|
||||
"""
|
||||
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License"
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""
|
||||
|
||||
import json
|
||||
from json import JSONDecodeError, JSONDecoder
|
||||
from typing import Any
|
||||
|
||||
import partial_json_parser
|
||||
from partial_json_parser.core.options import Allow
|
||||
|
||||
|
||||
def find_common_prefix(s1: str, s2: str) -> str:
|
||||
"""
|
||||
Finds a common prefix that is shared between two strings, if there is one.
|
||||
Order of arguments is NOT important.
|
||||
|
||||
This function is provided as a UTILITY for extracting information from JSON
|
||||
generated by partial_json_parser, to help in ensuring that the right tokens
|
||||
are returned in streaming, so that close-quotes, close-brackets and
|
||||
close-braces are not returned prematurely.
|
||||
|
||||
e.g. find_common_prefix('{"fruit": "ap"}', '{"fruit": "apple"}') ->
|
||||
'{"fruit": "ap'
|
||||
"""
|
||||
prefix = ""
|
||||
min_length = min(len(s1), len(s2))
|
||||
for i in range(0, min_length):
|
||||
if s1[i] == s2[i]:
|
||||
prefix += s1[i]
|
||||
else:
|
||||
break
|
||||
return prefix
|
||||
|
||||
|
||||
def find_common_suffix(s1: str, s2: str) -> str:
|
||||
"""
|
||||
Finds a common suffix shared between two strings, if there is one. Order of
|
||||
arguments is NOT important.
|
||||
Stops when the suffix ends OR it hits an alphanumeric character
|
||||
|
||||
e.g. find_common_suffix('{"fruit": "ap"}', '{"fruit": "apple"}') -> '"}'
|
||||
"""
|
||||
suffix = ""
|
||||
min_length = min(len(s1), len(s2))
|
||||
for i in range(1, min_length + 1):
|
||||
if s1[-i] == s2[-i] and not s1[-i].isalnum():
|
||||
suffix = s1[-i] + suffix
|
||||
else:
|
||||
break
|
||||
return suffix
|
||||
|
||||
|
||||
def extract_intermediate_diff(curr: str, old: str) -> str:
|
||||
"""
|
||||
Given two strings, extract the difference in the middle between two strings
|
||||
that are known to have a common prefix and/or suffix.
|
||||
|
||||
This function is provided as a UTILITY for extracting information from JSON
|
||||
generated by partial_json_parser, to help in ensuring that the right tokens
|
||||
are returned in streaming, so that close-quotes, close-brackets and
|
||||
close-braces are not returned prematurely. The order of arguments IS
|
||||
important - the new version of the partially-parsed JSON must be the first
|
||||
argument, and the second argument must be from the previous generation.
|
||||
|
||||
What it returns, is tokens that should be streamed to the client.
|
||||
|
||||
e.g. extract_intermediate_diff('{"fruit": "apple"}', '{"fruit": "ap"}')
|
||||
-> 'ple'
|
||||
|
||||
"""
|
||||
suffix = find_common_suffix(curr, old)
|
||||
|
||||
old = old[::-1].replace(suffix[::-1], "", 1)[::-1]
|
||||
prefix = find_common_prefix(curr, old)
|
||||
diff = curr
|
||||
if len(suffix):
|
||||
diff = diff[::-1].replace(suffix[::-1], "", 1)[::-1]
|
||||
|
||||
if len(prefix):
|
||||
# replace the prefix only once in case it's mirrored
|
||||
diff = diff.replace(prefix, "", 1)
|
||||
|
||||
return diff
|
||||
|
||||
|
||||
def find_all_indices(string: str, substring: str) -> list[int]:
|
||||
"""
|
||||
Find all (starting) indices of a substring in a given string. Useful for
|
||||
tool call extraction
|
||||
"""
|
||||
indices = []
|
||||
index = -1
|
||||
while True:
|
||||
index = string.find(substring, index + 1)
|
||||
if index == -1:
|
||||
break
|
||||
indices.append(index)
|
||||
return indices
|
||||
|
||||
|
||||
# partial_json_parser doesn't support extra data and
|
||||
# JSONDecoder.raw_decode doesn't support partial JSON
|
||||
def partial_json_loads(input_str: str, flags: Allow) -> tuple[Any, int]:
|
||||
try:
|
||||
return (partial_json_parser.loads(input_str, flags), len(input_str))
|
||||
except JSONDecodeError as e:
|
||||
if "Extra data" in e.msg:
|
||||
dec = JSONDecoder()
|
||||
return dec.raw_decode(input_str)
|
||||
raise
|
||||
|
||||
|
||||
def is_complete_json(input_str: str) -> bool:
|
||||
try:
|
||||
json.loads(input_str)
|
||||
return True
|
||||
except JSONDecodeError:
|
||||
return False
|
||||
|
||||
|
||||
def consume_space(i: int, s: str) -> int:
|
||||
while i < len(s) and s[i].isspace():
|
||||
i += 1
|
||||
return i
|
@@ -44,7 +44,7 @@ environment_variables: dict[str, Callable[[], Any]] = {
|
||||
# Whether to use HuggingFace tokenizer.
|
||||
"FD_USE_HF_TOKENIZER": lambda: os.getenv("FD_USE_HF_TOKENIZER", 0),
|
||||
# Set the high watermark (HWM) for sending data during ZMQ initialization
|
||||
"FD_ZMQ_SNDHWM": lambda: os.getenv("FD_ZMQ_SNDHWM", 10000),
|
||||
"FD_ZMQ_SNDHWM": lambda: os.getenv("FD_ZMQ_SNDHWM", 64000),
|
||||
# cache kv quant params directory
|
||||
"FD_CACHE_PARAMS": lambda: os.getenv("FD_CACHE_PARAMS", "none"),
|
||||
# Set attention backend. "NATIVE_ATTN", "APPEND_ATTN"
|
||||
@@ -80,6 +80,18 @@ environment_variables: dict[str, Callable[[], Any]] = {
|
||||
"EXPORTER_OTLP_HEADERS": lambda: os.getenv("EXPORTER_OTLP_HEADERS"),
|
||||
# enable kv cache block scheduler v1 (no need for kv_cache_ratio)
|
||||
"ENABLE_V1_KVCACHE_SCHEDULER": lambda: int(os.getenv("ENABLE_V1_KVCACHE_SCHEDULER", "0")),
|
||||
# set trace attribute job_id.
|
||||
"FD_JOB_ID": lambda: os.getenv("FD_JOB_ID"),
|
||||
# support max connections
|
||||
"FD_SUPPORT_MAX_CONNECTIONS": lambda: 768,
|
||||
# enable internal module to access LLMEngine.
|
||||
"FD_ENABLE_INTERNAL_ADAPTER": lambda: int(os.getenv("FD_ENABLE_INTERNAL_ADAPTER", "0")),
|
||||
# LLMEngine receive requests port, used when FD_ENABLE_INTERNAL_ADAPTER=1
|
||||
"FD_ZMQ_RECV_REQUEST_SERVER_PORT": lambda: os.getenv("FD_ZMQ_RECV_REQUEST_SERVER_PORT", "8200"),
|
||||
# LLMEngine send response port, used when FD_ENABLE_INTERNAL_ADAPTER=1
|
||||
"FD_ZMQ_SEND_RESPONSE_SERVER_PORT": lambda: os.getenv("FD_ZMQ_SEND_RESPONSE_SERVER_PORT", "8201"),
|
||||
# LLMEngine receive control command port, used when FD_ENABLE_INTERNAL_ADAPTER=1
|
||||
"FD_ZMQ_CONTROL_CMD_SERVER_PORTS": lambda: os.getenv("FD_ZMQ_CONTROL_CMD_SERVER_PORTS", "8202"),
|
||||
}
|
||||
|
||||
|
||||
|
@@ -43,13 +43,14 @@ class ErnieProcessor(BaseDataProcessor):
|
||||
pad_token_id (int): Token ID used for padding.
|
||||
"""
|
||||
|
||||
def __init__(self, model_name_or_path, reasoning_parser_obj=None):
|
||||
def __init__(self, model_name_or_path, reasoning_parser_obj=None, tool_parser_obj=None):
|
||||
|
||||
self.model_name_or_path = model_name_or_path
|
||||
data_processor_logger.info(f"model_name_or_path: {model_name_or_path}")
|
||||
self._init_config()
|
||||
|
||||
self.decode_status = dict()
|
||||
self.tool_parser_dict = dict()
|
||||
self.thinking_parser_dict = dict()
|
||||
self._load_tokenizer()
|
||||
data_processor_logger.info(
|
||||
@@ -61,6 +62,7 @@ class ErnieProcessor(BaseDataProcessor):
|
||||
self.eos_token_id_len = len(self.eos_token_ids)
|
||||
self.pad_token_id = self.get_pad_id()
|
||||
self.reasoning_parser = None
|
||||
self.tool_parser_obj = tool_parser_obj
|
||||
if reasoning_parser_obj:
|
||||
self.reasoning_parser = reasoning_parser_obj(self.tokenizer)
|
||||
|
||||
@@ -108,7 +110,16 @@ class ErnieProcessor(BaseDataProcessor):
|
||||
request.prompt_token_ids = token_ids
|
||||
data_processor_logger.info(f"req_id:{request.request_id}, tokens:{tokens}, token_ids: {token_ids}")
|
||||
else:
|
||||
request.prompt_token_ids = self.messages2ids(request.to_dict())
|
||||
task = request.to_dict()
|
||||
chat_template_kwargs = kwargs.get("chat_template_kwargs")
|
||||
if chat_template_kwargs:
|
||||
if isinstance(chat_template_kwargs, dict):
|
||||
for k, v in chat_template_kwargs.items():
|
||||
if k not in task:
|
||||
task[k] = v
|
||||
else:
|
||||
raise ValueError("Invalid input: chat_template_kwargs must be a dict")
|
||||
request.prompt_token_ids = self.messages2ids(task)
|
||||
|
||||
if len(request.prompt_token_ids) == 0:
|
||||
raise ValueError("Invalid input: prompt_token_ids must be a non-empty sequence of token IDs")
|
||||
@@ -124,6 +135,8 @@ class ErnieProcessor(BaseDataProcessor):
|
||||
request.set("temperature", 1)
|
||||
if request.get("top_p") < _SAMPLING_EPS:
|
||||
request.set("top_p", _SAMPLING_EPS)
|
||||
if self.reasoning_parser and self.reasoning_parser.__class__.__name__ == "ErnieX1ReasoningParser":
|
||||
request.enable_thinking = True
|
||||
data_processor_logger.info(f"Processed request {request}")
|
||||
return request
|
||||
|
||||
@@ -156,13 +169,21 @@ class ErnieProcessor(BaseDataProcessor):
|
||||
if request.get("prompt"):
|
||||
prompt = request.get("prompt")
|
||||
prompt = prompt[0] if isinstance(prompt, list) else prompt
|
||||
|
||||
request["text_after_process"] = prompt
|
||||
tokens = self.tokenizer.tokenize(prompt)
|
||||
token_ids = self.tokenizer.convert_tokens_to_ids(tokens)
|
||||
request["prompt_token_ids"] = token_ids
|
||||
req_id = request.get("request_id", None)
|
||||
data_processor_logger.info(f"req_id:{req_id}, tokens:{tokens}, token_ids: {token_ids}")
|
||||
else:
|
||||
chat_template_kwargs = request.get("chat_template_kwargs")
|
||||
if chat_template_kwargs:
|
||||
if isinstance(chat_template_kwargs, dict):
|
||||
for k, v in chat_template_kwargs.items():
|
||||
if k not in request:
|
||||
request[k] = v
|
||||
else:
|
||||
raise ValueError("Invalid input: chat_template_kwargs must be a dict")
|
||||
request["prompt_token_ids"] = self.messages2ids(request)
|
||||
if len(request["prompt_token_ids"]) == 0:
|
||||
raise ValueError("Invalid input: prompt_token_ids must be a non-empty sequence of token IDs")
|
||||
@@ -177,6 +198,8 @@ class ErnieProcessor(BaseDataProcessor):
|
||||
request["temperature"] = 1
|
||||
if request.get("top_p") < _SAMPLING_EPS:
|
||||
request["top_p"] = _SAMPLING_EPS
|
||||
if self.reasoning_parser and self.reasoning_parser.__class__.__name__ == "ErnieX1ReasoningParser":
|
||||
request["enable_thinking"] = True
|
||||
data_processor_logger.info(f"Processed request {request}")
|
||||
|
||||
return request
|
||||
@@ -204,6 +227,12 @@ class ErnieProcessor(BaseDataProcessor):
|
||||
response_dict.outputs.reasoning_content = reasoning_content
|
||||
else:
|
||||
response_dict.outputs.text = full_text
|
||||
if self.tool_parser_obj:
|
||||
tool_parser = self.tool_parser_obj(self.tokenizer)
|
||||
tool_call_info = tool_parser.extract_tool_calls(full_text, response_dict)
|
||||
if tool_call_info.tools_called:
|
||||
response_dict.outputs.tool_calls = tool_call_info.tool_calls
|
||||
response_dict.outputs.text = tool_call_info.content
|
||||
data_processor_logger.info(f"req_id:{req_id}, token)ids: {token_ids}")
|
||||
if response_dict.outputs.text == "" and response_dict.outputs.reasoning_content == "":
|
||||
return None
|
||||
@@ -244,12 +273,21 @@ class ErnieProcessor(BaseDataProcessor):
|
||||
delta_text, _, previous_texts = self.ids2tokens(token_ids, req_id)
|
||||
if is_end:
|
||||
full_text = previous_texts + delta_text
|
||||
if enable_thinking and self.reasoning_parser:
|
||||
if self.reasoning_parser and (
|
||||
enable_thinking or self.reasoning_parser.__class__.__name__ == "ErnieX1ReasoningParser"
|
||||
):
|
||||
reasoning_content, text = self.reasoning_parser.extract_reasoning_content(full_text, response_dict)
|
||||
response_dict["outputs"]["text"] = text
|
||||
response_dict["outputs"]["reasoning_content"] = reasoning_content
|
||||
else:
|
||||
response_dict["outputs"]["text"] = full_text
|
||||
if self.tool_parser_obj:
|
||||
tool_parser = self.tool_parser_obj(self.tokenizer)
|
||||
tool_call_info = tool_parser.extract_tool_calls(full_text, response_dict)
|
||||
if tool_call_info.tools_called:
|
||||
response_dict["outputs"]["tool_call"] = tool_call_info.tool_calls
|
||||
response_dict["outputs"]["text"] = tool_call_info.content
|
||||
response_dict["outputs"]["raw_prediction"] = full_text
|
||||
data_processor_logger.info(f"req_id:{req_id}, decode_status: {self.decode_status[req_id]}")
|
||||
del self.decode_status[req_id]
|
||||
return response_dict
|
||||
@@ -273,8 +311,11 @@ class ErnieProcessor(BaseDataProcessor):
|
||||
if token_ids[-1] == self.tokenizer.eos_token_id:
|
||||
token_ids = token_ids[:-1]
|
||||
delta_text, previous_token_ids, previous_texts = self.ids2tokens(token_ids, req_id)
|
||||
if enable_thinking and self.reasoning_parser:
|
||||
reasoning_content, text = self.reasoning_parser.extract_reasoning_content_streaming(
|
||||
response_dict["outputs"]["raw_prediction"] = delta_text
|
||||
if self.reasoning_parser and (
|
||||
enable_thinking or self.reasoning_parser.__class__.__name__ == "ErnieX1ReasoningParser"
|
||||
):
|
||||
reasoning_delta_message = self.reasoning_parser.extract_reasoning_content_streaming(
|
||||
previous_texts,
|
||||
previous_texts + delta_text,
|
||||
delta_text,
|
||||
@@ -282,13 +323,28 @@ class ErnieProcessor(BaseDataProcessor):
|
||||
previous_token_ids + token_ids,
|
||||
token_ids,
|
||||
)
|
||||
response_dict["outputs"]["text"] = text
|
||||
response_dict["outputs"]["reasoning_content"] = reasoning_content
|
||||
else:
|
||||
response_dict["outputs"]["text"] = delta_text
|
||||
response_dict["outputs"]["delta_message"] = reasoning_delta_message
|
||||
if self.tool_parser_obj:
|
||||
if req_id not in self.tool_parser_dict:
|
||||
self.tool_parser_dict[req_id] = self.tool_parser_obj(self.tokenizer)
|
||||
tool_parser = self.tool_parser_dict[req_id]
|
||||
tool_call_delta_message = tool_parser.extract_tool_calls_streaming(
|
||||
previous_texts,
|
||||
previous_texts + delta_text,
|
||||
delta_text,
|
||||
previous_token_ids,
|
||||
previous_token_ids + token_ids,
|
||||
token_ids,
|
||||
response_dict,
|
||||
)
|
||||
if tool_call_delta_message is None or tool_call_delta_message.tool_calls:
|
||||
response_dict["outputs"]["delta_message"] = tool_call_delta_message
|
||||
response_dict["outputs"]["text"] = delta_text
|
||||
if is_end:
|
||||
data_processor_logger.info(f"req_id:{req_id}, decode_status: {self.decode_status[req_id]}")
|
||||
del self.decode_status[req_id]
|
||||
if req_id in self.tool_parser_dict:
|
||||
del self.tool_parser_dict[req_id]
|
||||
return response_dict
|
||||
|
||||
def messages2ids(self, request_or_messages):
|
||||
@@ -310,7 +366,7 @@ class ErnieProcessor(BaseDataProcessor):
|
||||
split_special_tokens=False,
|
||||
add_special_tokens=False,
|
||||
)
|
||||
|
||||
request_or_messages["text_after_process"] = spliced_message
|
||||
req_id = None
|
||||
if isinstance(request_or_messages, dict):
|
||||
req_id = request_or_messages.get("request_id", None)
|
||||
|
@@ -14,8 +14,6 @@
|
||||
# limitations under the License.
|
||||
"""
|
||||
|
||||
# cipher_token=WjI1fQOvhN # do not edit this line
|
||||
|
||||
import os
|
||||
import re
|
||||
from shutil import copyfile
|
||||
|
@@ -34,6 +34,7 @@ class ErnieMoEVLProcessor(ErnieProcessor):
|
||||
limit_mm_per_prompt=None,
|
||||
mm_processor_kwargs=None,
|
||||
reasoning_parser_obj=None,
|
||||
tool_parser_obj=None,
|
||||
):
|
||||
self.use_hf_tokenizer = False
|
||||
|
||||
@@ -53,6 +54,7 @@ class ErnieMoEVLProcessor(ErnieProcessor):
|
||||
self.image_patch_id = self.ernie_processor.image_patch_id
|
||||
self.spatial_conv_size = self.ernie_processor.spatial_conv_size
|
||||
|
||||
self.tool_parser_dict = dict()
|
||||
self.decode_status = dict()
|
||||
self._load_tokenizer()
|
||||
self.eos_token_ids = [self.tokenizer.eos_token_id]
|
||||
@@ -60,6 +62,7 @@ class ErnieMoEVLProcessor(ErnieProcessor):
|
||||
self.pad_token_id = self.get_pad_id()
|
||||
self.limit_mm_per_prompt = self._parse_limits(limit_mm_per_prompt)
|
||||
self.reasoning_parser = None
|
||||
self.tool_parser_obj = tool_parser_obj
|
||||
if reasoning_parser_obj:
|
||||
self.reasoning_parser = reasoning_parser_obj(self.tokenizer)
|
||||
|
||||
@@ -109,7 +112,7 @@ class ErnieMoEVLProcessor(ErnieProcessor):
|
||||
def process_request(self, request, max_model_len=None, **kwargs):
|
||||
"""process the input data"""
|
||||
task = request.to_dict()
|
||||
task["enable_thinking"] = kwargs.get("enable_thinking", True)
|
||||
task["chat_template_kwargs"] = kwargs.get("chat_template_kwargs")
|
||||
self.process_request_dict(task, max_model_len)
|
||||
request = Request.from_dict(task)
|
||||
request = self._apply_default_parameters(request)
|
||||
@@ -211,10 +214,20 @@ class ErnieMoEVLProcessor(ErnieProcessor):
|
||||
self._check_mm_limits(multimodal_data)
|
||||
images = multimodal_data.get("image", None)
|
||||
videos = multimodal_data.get("video", None)
|
||||
request["text_after_process"] = request.get("prompt")
|
||||
outputs = self.ernie_processor.text2ids(request["prompt"], images, videos)
|
||||
elif request.get("messages"):
|
||||
messages = request["messages"]
|
||||
self._check_mm_limits(messages)
|
||||
chat_template_kwargs = request.get("chat_template_kwargs")
|
||||
if chat_template_kwargs:
|
||||
if isinstance(chat_template_kwargs, dict):
|
||||
for k, v in chat_template_kwargs.items():
|
||||
if k not in request:
|
||||
request[k] = v
|
||||
else:
|
||||
raise ValueError("Invalid input: chat_template_kwargs must be a dict")
|
||||
request.setdefault("enable_thinking", True)
|
||||
outputs = self.ernie_processor.request2ids(request)
|
||||
else:
|
||||
raise ValueError(f"Request must contain 'prompt', or 'messages': {request}")
|
||||
@@ -233,6 +246,10 @@ class ErnieMoEVLProcessor(ErnieProcessor):
|
||||
request["prompt_token_ids"] = request["prompt_token_ids"][: max_model_len - 1]
|
||||
if request.get("max_tokens") is None:
|
||||
request["max_tokens"] = max(1, max_model_len - len(request["prompt_token_ids"]))
|
||||
else:
|
||||
request["max_tokens"] = min(max_model_len - len(request["prompt_token_ids"]), request["max_tokens"])
|
||||
if not request.get("reasoning_max_tokens"):
|
||||
request["reasoning_max_tokens"] = max(int(request["max_tokens"] * 0.8), 1)
|
||||
data_processor_logger.info(f"Processed request {request}")
|
||||
|
||||
return request
|
||||
|
@@ -494,16 +494,15 @@ class DataProcessor:
|
||||
"""
|
||||
if self.tokenizer.chat_template is None:
|
||||
raise ValueError("This model does not support chat_template.")
|
||||
|
||||
prompt_token_str = (
|
||||
self.tokenizer.apply_chat_template(
|
||||
request,
|
||||
tokenize=False,
|
||||
add_generation_prompt=request.get("add_generation_prompt", True),
|
||||
)
|
||||
.replace("<|image@placeholder|>", "")
|
||||
.replace("<|video@placeholder|>", "")
|
||||
prompt_token_template = self.tokenizer.apply_chat_template(
|
||||
request,
|
||||
tokenize=False,
|
||||
add_generation_prompt=request.get("add_generation_prompt", True),
|
||||
)
|
||||
prompt_token_str = prompt_token_template.replace("<|image@placeholder|>", "").replace(
|
||||
"<|video@placeholder|>", ""
|
||||
)
|
||||
request["text_after_process"] = prompt_token_template
|
||||
tokens = self.tokenizer.tokenize(prompt_token_str)
|
||||
token_ids = self.tokenizer.convert_tokens_to_ids(tokens)
|
||||
data_processor_logger.info(
|
||||
|
@@ -88,9 +88,6 @@ def process_image_data(image_data, mime_type, url):
|
||||
|
||||
def http_to_pil_image(url):
|
||||
"""http_to_pil_image"""
|
||||
if is_public_url(url) and int(os.getenv("DOWNLOAD_WITH_TP_SERVER", "0")):
|
||||
return http_to_pil_image_with_tp_server(url)
|
||||
|
||||
response = requests.get(url)
|
||||
if response.status_code != 200:
|
||||
raise Exception("Failed to download the image from URL.")
|
||||
@@ -106,60 +103,6 @@ def http_to_pil_image(url):
|
||||
return pil_image
|
||||
|
||||
|
||||
def http_to_pil_image_with_tp_server(url, retry_time=6):
|
||||
"""cnap平台没有外网访问权限,需要使用tp服务下载图片"""
|
||||
proxies = [
|
||||
{"http": "http://10.229.197.142:8807"},
|
||||
{"http": "http://10.229.197.161:8804"},
|
||||
{"http": "http://10.229.198.143:8804"},
|
||||
{"http": "http://10.122.108.164:8807"},
|
||||
{"http": "http://10.122.108.165:8807"},
|
||||
{"http": "http://10.122.108.166:8807"},
|
||||
{"http": "http://10.122.108.168:8801"},
|
||||
{"http": "http://10.122.150.146:8802"},
|
||||
{"http": "http://10.122.150.158:8802"},
|
||||
{"http": "http://10.122.150.164:8801"},
|
||||
{"http": "http://10.143.51.38:8813"},
|
||||
{"http": "http://10.143.103.42:8810"},
|
||||
{"http": "http://10.143.194.45:8804"},
|
||||
{"http": "http://10.143.226.25:8801"},
|
||||
{"http": "http://10.143.236.12:8807"},
|
||||
{"http": "http://10.143.238.36:8807"},
|
||||
{"http": "http://10.144.71.30:8807"},
|
||||
{"http": "http://10.144.73.16:8804"},
|
||||
{"http": "http://10.144.138.36:8801"},
|
||||
{"http": "http://10.144.152.40:8810"},
|
||||
{"http": "http://10.144.199.29:8810"},
|
||||
{"http": "http://10.144.251.29:8813"},
|
||||
]
|
||||
headers = {
|
||||
"X-Tp-Authorization": "Basic RVJOSUVMaXRlVjpFUk5JRUxpdGVWXzFxYXo0cmZ2M2VkYzV0Z2Iyd3N4LWJmZS10cA==",
|
||||
"scheme": "https",
|
||||
}
|
||||
|
||||
new_url = url.replace("https://", "http://") if url.startswith("https://") else url
|
||||
|
||||
# The proxies can be unstable, so retry on failure
|
||||
for idx in range(retry_time):
|
||||
try:
|
||||
response = requests.get(new_url, headers=headers, proxies=random.choice(proxies))
|
||||
if response.status_code == 200:
|
||||
image_data = io.BytesIO(response.content)
|
||||
|
||||
mime_type = response.headers.get("Content-Type")
|
||||
if mime_type is None:
|
||||
mime_type, _ = mimetypes.guess_type(url)
|
||||
|
||||
data_processor_logger.info(f"Detected MIME type: {mime_type}") # 调试信息
|
||||
pil_image = process_image_data(image_data, mime_type, url)
|
||||
|
||||
return pil_image
|
||||
except Exception as e:
|
||||
data_processor_logger.error(f"Failed to download the image, idx: {idx}, URL: {url}, error: {e}")
|
||||
|
||||
raise Exception(f"Failed to download the image from URL: {url}")
|
||||
|
||||
|
||||
def base64_to_pil_image(base64_string):
|
||||
"""base64_to_pil_image"""
|
||||
image_bytes = base64.b64decode(base64_string)
|
||||
|
@@ -18,6 +18,7 @@ from typing import Any, Dict, Optional
|
||||
|
||||
from fastdeploy.config import ErnieArchitectures
|
||||
from fastdeploy.engine.config import ModelConfig
|
||||
from fastdeploy.entrypoints.openai.tool_parsers import ToolParserManager
|
||||
from fastdeploy.reasoning import ReasoningParserManager
|
||||
|
||||
|
||||
@@ -48,6 +49,7 @@ class InputPreprocessor:
|
||||
limit_mm_per_prompt: Optional[Dict[str, Any]] = None,
|
||||
mm_processor_kwargs: Optional[Dict[str, Any]] = None,
|
||||
enable_mm: bool = False,
|
||||
tool_parser: str = None,
|
||||
) -> None:
|
||||
|
||||
self.model_name_or_path = model_name_or_path
|
||||
@@ -55,6 +57,7 @@ class InputPreprocessor:
|
||||
self.enable_mm = enable_mm
|
||||
self.limit_mm_per_prompt = limit_mm_per_prompt
|
||||
self.mm_processor_kwargs = mm_processor_kwargs
|
||||
self.tool_parser = tool_parser
|
||||
|
||||
def create_processor(self):
|
||||
"""
|
||||
@@ -68,8 +71,11 @@ class InputPreprocessor:
|
||||
DataProcessor or MultiModalRegistry.Processor (Union[DataProcessor, MultiModalRegistry.Processor]): the data processor.
|
||||
"""
|
||||
reasoning_parser_obj = None
|
||||
tool_parser_obj = None
|
||||
if self.reasoning_parser:
|
||||
reasoning_parser_obj = ReasoningParserManager.get_reasoning_parser(self.reasoning_parser)
|
||||
if self.tool_parser:
|
||||
tool_parser_obj = ToolParserManager.get_tool_parser(self.tool_parser)
|
||||
architectures = ModelConfig({"model": self.model_name_or_path}).architectures[0]
|
||||
if not self.enable_mm:
|
||||
if not ErnieArchitectures.contains_ernie_arch(architectures):
|
||||
@@ -78,6 +84,7 @@ class InputPreprocessor:
|
||||
self.processor = DataProcessor(
|
||||
model_name_or_path=self.model_name_or_path,
|
||||
reasoning_parser_obj=reasoning_parser_obj,
|
||||
tool_parser_obj=tool_parser_obj,
|
||||
)
|
||||
else:
|
||||
from fastdeploy.input.ernie_processor import ErnieProcessor
|
||||
@@ -85,6 +92,7 @@ class InputPreprocessor:
|
||||
self.processor = ErnieProcessor(
|
||||
model_name_or_path=self.model_name_or_path,
|
||||
reasoning_parser_obj=reasoning_parser_obj,
|
||||
tool_parser_obj=tool_parser_obj,
|
||||
)
|
||||
else:
|
||||
if not architectures.startswith("Ernie4_5_VLMoeForConditionalGeneration"):
|
||||
@@ -97,5 +105,6 @@ class InputPreprocessor:
|
||||
limit_mm_per_prompt=self.limit_mm_per_prompt,
|
||||
mm_processor_kwargs=self.mm_processor_kwargs,
|
||||
reasoning_parser_obj=reasoning_parser_obj,
|
||||
tool_parser_obj=tool_parser_obj,
|
||||
)
|
||||
return self.processor
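To illustrate how the new tool_parser argument reaches the processors, a hedged construction sketch; the import path and model path are assumptions, and the constructor signature is only partially visible in this diff:

from fastdeploy.input.preprocess import InputPreprocessor  # assumed module path

preprocessor = InputPreprocessor(
    model_name_or_path="/path/to/ernie-model",  # placeholder path
    tool_parser="ernie_x1",  # resolved via ToolParserManager.get_tool_parser
)
processor = preprocessor.create_processor()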
|
||||
|
@@ -148,7 +148,7 @@ class BaseDataProcessor(ABC):
|
||||
|
||||
|
||||
class DataProcessor(BaseDataProcessor):
|
||||
def __init__(self, model_name_or_path, reasoning_parser_obj=None):
|
||||
def __init__(self, model_name_or_path, reasoning_parser_obj=None, tool_parser_obj=None):
|
||||
"""
|
||||
Initializes the DecodeStatus object.
|
||||
|
||||
@@ -168,6 +168,7 @@ class DataProcessor(BaseDataProcessor):
|
||||
self._init_config()
|
||||
|
||||
self.decode_status = dict()
|
||||
self.tool_parser_dict = dict()
|
||||
self.tokenizer = self._load_tokenizer()
|
||||
data_processor_logger.info(
|
||||
f"tokenizer information: bos_token is {self.tokenizer.bos_token}, {self.tokenizer.bos_token_id}, \
|
||||
@@ -180,6 +181,7 @@ class DataProcessor(BaseDataProcessor):
|
||||
self.eos_token_id_len = len(self.eos_token_ids)
|
||||
self.pad_token_id = self.get_pad_id()
|
||||
self.reasoning_parser = None
|
||||
self.tool_parser_obj = tool_parser_obj
|
||||
if reasoning_parser_obj:
|
||||
self.reasoning_parser = reasoning_parser_obj(self.tokenizer)
|
||||
self.tokenizer.pad_token_id = self.pad_token_id
|
||||
@@ -222,7 +224,6 @@ class DataProcessor(BaseDataProcessor):
|
||||
request = self._apply_default_parameters(request)
|
||||
if request.get("eos_token_ids") is None or len(request.eos_token_ids) == 0:
|
||||
request.eos_token_ids = self.eos_token_ids
|
||||
|
||||
stop_sequences = request.get("stop", [])
|
||||
if stop_sequences is not None and len(stop_sequences) != 0:
|
||||
stop_seqs, stop_seqs_len = self.update_stop_seq(stop_sequences)
|
||||
@@ -236,7 +237,15 @@ class DataProcessor(BaseDataProcessor):
|
||||
if self.tokenizer.chat_template is None:
|
||||
raise ValueError("This model does not support chat_template.")
|
||||
task = request.to_dict()
|
||||
task["enable_thinking"] = kwargs.get("enable_thinking", True)
|
||||
chat_template_kwargs = kwargs.get("chat_template_kwargs")
|
||||
if chat_template_kwargs:
|
||||
if isinstance(chat_template_kwargs, dict):
|
||||
for k, v in chat_template_kwargs.items():
|
||||
if k not in task:
|
||||
task[k] = v
|
||||
else:
|
||||
raise ValueError("Invalid input: chat_template_kwargs must be a dict")
|
||||
task.setdefault("enable_thinking", True)
|
||||
request.prompt_token_ids = self.messages2ids(task)
|
||||
else:
|
||||
raise ValueError(f"The request should have `input_ids`, `text` or `messages`: {request}.")
|
||||
@@ -281,10 +290,20 @@ class DataProcessor(BaseDataProcessor):
|
||||
# processing prompt_token_ids
|
||||
if not request.get("prompt_token_ids"):
|
||||
if "prompt" in request:
|
||||
request["text_after_process"] = request["prompt"]
|
||||
request["prompt_token_ids"] = self.text2ids(request["prompt"], max_model_len).tolist()
|
||||
elif "messages" in request:
|
||||
if self.tokenizer.chat_template is None:
|
||||
raise ValueError("This model does not support chat_template.")
|
||||
chat_template_kwargs = request.get("chat_template_kwargs")
|
||||
if chat_template_kwargs:
|
||||
if isinstance(chat_template_kwargs, dict):
|
||||
for k, v in chat_template_kwargs.items():
|
||||
if k not in request:
|
||||
request[k] = v
|
||||
else:
|
||||
raise ValueError("Invalid input: chat_template_kwargs must be a dict")
|
||||
request.setdefault("enable_thinking", True)
|
||||
request["prompt_token_ids"] = self.messages2ids(request)
|
||||
else:
|
||||
raise ValueError(f"Request must contain 'prompt_token_ids', 'prompt', or 'messages': {request}")
|
||||
@@ -328,6 +347,12 @@ class DataProcessor(BaseDataProcessor):
|
||||
else:
|
||||
# The model does not support thinking and enable_thinking was not explicitly set to False
|
||||
response_dict.outputs.text = full_text
|
||||
if self.tool_parser_obj:
|
||||
tool_parser = self.tool_parser_obj(self.tokenizer)
|
||||
tool_call_info = tool_parser.extract_tool_calls(full_text, response_dict)
|
||||
if tool_call_info.tools_called:
|
||||
response_dict.outputs.tool_calls = tool_call_info.tool_calls
|
||||
response_dict.outputs.text = tool_call_info.content
|
||||
data_processor_logger.info(f"req_id:{req_id}, token)ids: {token_ids}")
|
||||
|
||||
return response_dict
|
||||
@@ -352,12 +377,19 @@ class DataProcessor(BaseDataProcessor):
|
||||
delta_text, _, previous_texts = self.ids2tokens(token_ids, req_id)
|
||||
if is_end:
|
||||
full_text = previous_texts + delta_text
|
||||
response_dict["outputs"]["raw_prediction"] = full_text
|
||||
if enable_thinking and self.reasoning_parser:
|
||||
reasoning_content, text = self.reasoning_parser.extract_reasoning_content(full_text, response_dict)
|
||||
response_dict["outputs"]["text"] = text
|
||||
response_dict["outputs"]["reasoning_content"] = reasoning_content
|
||||
else:
|
||||
response_dict["outputs"]["text"] = full_text
|
||||
if self.tool_parser_obj:
|
||||
tool_parser = self.tool_parser_obj(self.tokenizer)
|
||||
tool_call_info = tool_parser.extract_tool_calls(full_text, response_dict)
|
||||
if tool_call_info.tools_called:
|
||||
response_dict["outputs"]["tool_call"] = tool_call_info.tool_calls
|
||||
response_dict["outputs"]["text"] = tool_call_info.content
|
||||
data_processor_logger.info(f"req_id:{req_id}, decode_status: {self.decode_status[req_id]}")
|
||||
del self.decode_status[req_id]
|
||||
return response_dict
|
||||
@@ -381,9 +413,11 @@ class DataProcessor(BaseDataProcessor):
|
||||
if token_ids[-1] == self.tokenizer.eos_token_id:
|
||||
token_ids = token_ids[:-1]
|
||||
delta_text, previous_token_ids, previous_texts = self.ids2tokens(token_ids, req_id)
|
||||
|
||||
if enable_thinking and self.reasoning_parser:
|
||||
reasoning_content, text = self.reasoning_parser.extract_reasoning_content_streaming(
|
||||
response_dict["outputs"]["raw_prediction"] = delta_text
|
||||
if self.reasoning_parser and (
|
||||
enable_thinking or self.reasoning_parser.__class__.__name__ == "ErnieX1ReasoningParser"
|
||||
):
|
||||
reasoning_delta_message = self.reasoning_parser.extract_reasoning_content_streaming(
|
||||
previous_texts,
|
||||
previous_texts + delta_text,
|
||||
delta_text,
|
||||
@@ -391,13 +425,28 @@ class DataProcessor(BaseDataProcessor):
|
||||
previous_token_ids + token_ids,
|
||||
token_ids,
|
||||
)
|
||||
response_dict["outputs"]["text"] = text
|
||||
response_dict["outputs"]["reasoning_content"] = reasoning_content
|
||||
else:
|
||||
response_dict["outputs"]["text"] = delta_text
|
||||
response_dict["outputs"]["delta_message"] = reasoning_delta_message
|
||||
if self.tool_parser_obj:
|
||||
if req_id not in self.tool_parser_dict:
|
||||
self.tool_parser_dict[req_id] = self.tool_parser_obj(self.tokenizer)
|
||||
tool_parser = self.tool_parser_dict[req_id]
|
||||
tool_call = tool_parser.extract_tool_calls_streaming(
|
||||
previous_texts,
|
||||
previous_texts + delta_text,
|
||||
delta_text,
|
||||
previous_token_ids,
|
||||
previous_token_ids + token_ids,
|
||||
token_ids,
|
||||
response_dict,
|
||||
)
|
||||
if tool_call is None or tool_call.tool_calls:
|
||||
response_dict["outputs"]["delta_message"] = tool_call
|
||||
response_dict["outputs"]["text"] = delta_text
|
||||
if is_end:
|
||||
data_processor_logger.info(f"req_id:{req_id}, decode_status: {self.decode_status[req_id]}")
|
||||
del self.decode_status[req_id]
|
||||
if req_id in self.tool_parser_dict:
|
||||
del self.tool_parser_dict[req_id]
|
||||
return response_dict
|
||||
|
||||
def process_response_dict(self, response_dict, **kwargs):
|
||||
@@ -472,6 +521,7 @@ class DataProcessor(BaseDataProcessor):
|
||||
add_special_tokens=False,
|
||||
return_tensors="pd",
|
||||
)
|
||||
request["text_after_process"] = spliced_message
|
||||
req_id = None
|
||||
tokens = self.tokenizer.tokenize(spliced_message)
|
||||
if isinstance(request, dict):
|
||||
|
@@ -17,6 +17,7 @@
|
||||
from .engine_cache_queue import EngineCacheQueue
|
||||
from .engine_worker_queue import EngineWorkerQueue
|
||||
from .ipc_signal import IPCSignal
|
||||
from .zmq_client import ZmqClient
|
||||
from .zmq_client import ZmqIpcClient
|
||||
from .zmq_server import ZmqIpcServer, ZmqTcpServer
|
||||
|
||||
__all__ = ["ZmqClient", "IPCSignal", "EngineWorkerQueue", "EngineCacheQueue"]
|
||||
__all__ = ["ZmqIpcClient", "IPCSignal", "EngineWorkerQueue", "EngineCacheQueue", "ZmqTcpServer", "ZmqIpcServer"]
|
||||
|
@@ -14,200 +14,100 @@
|
||||
# limitations under the License.
|
||||
"""
|
||||
|
||||
import os
|
||||
import threading
|
||||
import time
|
||||
from abc import ABC, abstractmethod
|
||||
|
||||
import msgpack
|
||||
import zmq
|
||||
|
||||
from fastdeploy import envs
|
||||
from fastdeploy.utils import llm_logger
|
||||
|
||||
|
||||
class ZmqClient:
|
||||
class ZmqClientBase(ABC):
|
||||
"""
|
||||
ZmqClient is a class that provides a client-side interface for sending and receiving messages using ZeroMQ.
|
||||
ZmqClientBase is a base class that provides a client-side interface for sending and receiving messages using ZeroMQ.
|
||||
"""
|
||||
|
||||
def __init__(self, name, mode):
|
||||
self.context = zmq.Context()
|
||||
self.socket = self.context.socket(mode)
|
||||
self.file_name = f"/dev/shm/{name}.socket"
|
||||
self.router_path = f"/dev/shm/router_{name}.ipc"
|
||||
def __init__(self):
|
||||
pass
|
||||
|
||||
self.ZMQ_SNDHWM = int(envs.FD_ZMQ_SNDHWM)
|
||||
self.aggregate_send = envs.FD_USE_AGGREGATE_SEND
|
||||
@abstractmethod
|
||||
def _create_socket(self):
|
||||
"""Abstract method to create and return a ZeroMQ socket."""
|
||||
pass
|
||||
|
||||
self.mutex = threading.Lock()
|
||||
self.req_dict = dict()
|
||||
self.router = None
|
||||
self.poller = None
|
||||
self.running = True
|
||||
def _ensure_socket(self):
|
||||
"""Ensure the socket is created before use."""
|
||||
if self.socket is None:
|
||||
self.socket = self._create_socket()
|
||||
|
||||
@abstractmethod
|
||||
def connect(self):
|
||||
"""
|
||||
Connect to the server using the file name specified in the constructor.
|
||||
"""
|
||||
self.socket.connect(f"ipc://{self.file_name}")
|
||||
|
||||
def start_server(self):
|
||||
"""
|
||||
Start the server using the file name specified in the constructor.
|
||||
"""
|
||||
self.socket.setsockopt(zmq.SNDHWM, self.ZMQ_SNDHWM)
|
||||
self.socket.setsockopt(zmq.SNDTIMEO, -1)
|
||||
self.socket.bind(f"ipc://{self.file_name}")
|
||||
self.poller = zmq.Poller()
|
||||
self.poller.register(self.socket, zmq.POLLIN)
|
||||
|
||||
def create_router(self):
|
||||
"""
|
||||
Create a ROUTER socket and bind it to the specified router path.
|
||||
"""
|
||||
self.router = self.context.socket(zmq.ROUTER)
|
||||
self.router.setsockopt(zmq.SNDHWM, self.ZMQ_SNDHWM)
|
||||
self.router.setsockopt(zmq.SNDTIMEO, -1)
|
||||
self.router.bind(f"ipc://{self.router_path}")
|
||||
pass
|
||||
|
||||
def send_json(self, data):
|
||||
"""
|
||||
Send a JSON-serializable object over the socket.
|
||||
"""
|
||||
self._ensure_socket()
|
||||
self.socket.send_json(data)
|
||||
|
||||
def recv_json(self):
|
||||
"""
|
||||
Receive a JSON-serializable object from the socket.
|
||||
"""
|
||||
self._ensure_socket()
|
||||
return self.socket.recv_json()
|
||||
|
||||
def send_pyobj(self, data):
|
||||
"""
|
||||
Send a Pickle-serializable object over the socket.
|
||||
"""
|
||||
self._ensure_socket()
|
||||
self.socket.send_pyobj(data)
|
||||
|
||||
def recv_pyobj(self):
|
||||
"""
|
||||
Receive a Pickle-serializable object from the socket.
|
||||
"""
|
||||
self._ensure_socket()
|
||||
return self.socket.recv_pyobj()
|
||||
|
||||
def pack_aggregated_data(self, data):
|
||||
"""
|
||||
Aggregate multiple responses into one and send them to the client.
|
||||
"""
|
||||
result = data[0]
|
||||
if len(data) > 1:
|
||||
for response in data[1:]:
|
||||
result.add(response)
|
||||
result = msgpack.packb([result.to_dict()])
|
||||
return result
|
||||
@abstractmethod
|
||||
def close(self):
|
||||
pass
|
||||
|
||||
    def send_multipart(self, req_id, data):
        """
        Send a multipart message to the router socket.
        """
        if self.router is None:
            raise RuntimeError("Router socket not created. Call create_router() first.")

        while self.running:
            with self.mutex:
                if req_id not in self.req_dict:
                    try:
                        client, _, request_id = self.router.recv_multipart(flags=zmq.NOBLOCK)
                        req_id_str = request_id.decode("utf-8")
                        self.req_dict[req_id_str] = client
                    except zmq.Again:
                        time.sleep(0.001)
                        continue
                else:
                    break
class ZmqIpcClient(ZmqClientBase):
    def __init__(self, name, mode):
        self.name = name
        self.mode = mode
        self.file_name = f"/dev/shm/{name}.socket"
        self.context = zmq.Context()
        self.socket = self.context.socket(self.mode)

        try:
            start_send = time.time()
            if self.aggregate_send:
                result = self.pack_aggregated_data(data)
            else:
                result = msgpack.packb([response.to_dict() for response in data])
            self.router.send_multipart([self.req_dict[req_id], b"", result])
            llm_logger.debug(f"send_multipart result: {req_id} len {len(data)} elapse: {time.time()-start_send}")
    def _create_socket(self):
        """create and return a ZeroMQ socket."""
        self.context = zmq.Context()
        return self.context.socket(self.mode)

        except Exception as e:
            llm_logger.error(f"Send result to zmq client failed: {e}")

        if data[-1].finished:
            with self.mutex:
                self.req_dict.pop(req_id, None)
                llm_logger.info(f"send_multipart finished, req_id: {req_id}")

    def receive_json_once(self, block=False):
        """
        Receive a single message from the socket.
        """
        if self.socket is None or self.socket.closed:
            return "zmp socket has closed", None
        try:
            flags = zmq.NOBLOCK if not block else 0
            return None, self.socket.recv_json(flags=flags)
        except zmq.Again:
            return None, None
        except Exception as e:
            self.close()
            llm_logger.warning(f"{e}")
            return str(e), None

    def receive_pyobj_once(self, block=False):
        """
        Receive a single message from the socket.
        """
        if self.socket is None or self.socket.closed:
            return "zmp socket has closed", None
        try:
            flags = zmq.NOBLOCK if not block else 0
            return None, self.socket.recv_pyobj(flags=flags)
        except zmq.Again:
            return None, None
        except Exception as e:
            self.close()
            llm_logger.warning(f"{e}")
            return str(e), None

    def _clear_ipc(self, name):
        """
        Remove the IPC file with the given name.
        """
        if os.path.exists(name):
            try:
                os.remove(name)
            except OSError as e:
                llm_logger.warning(f"Failed to remove IPC file {name} - {e}")
    def connect(self):
        self._ensure_socket()
        self.socket.connect(f"ipc://{self.file_name}")

    def close(self):
        """
        Close the socket and context, and remove the IPC files.
        Close the socket and context.
        """
        if not self.running:
            return

        self.running = False
        llm_logger.info("Closing ZMQ connection...")
        llm_logger.info("ZMQ client is closing connection...")
        try:
            if hasattr(self, "socket") and not self.socket.closed:
            if self.socket is not None and not self.socket.closed:
                self.socket.setsockopt(zmq.LINGER, 0)
                self.socket.close()

            if self.router is not None and not self.router.closed:
                self.router.close()

            if not self.context.closed:
            if self.context is not None:
                self.context.term()

            self._clear_ipc(self.file_name)
            self._clear_ipc(self.router_path)
        except Exception as e:
            llm_logger.warning(f"Failed to close ZMQ connection - {e}")
            llm_logger.warning(f"ZMQ client failed to close connection - {e}")
            return

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.close()

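For context, the change above splits the old ZmqClient into an abstract ZmqClientBase and a concrete ZmqIpcClient that talks over an ipc:// endpoint under /dev/shm. The following is a minimal, self-contained pyzmq sketch of that same pattern; the endpoint path and payload are made up for illustration and this is not the FastDeploy API itself.

import os
import tempfile

import zmq

# Hypothetical endpoint; FastDeploy binds /dev/shm/<name>.socket, a temp dir is used here.
endpoint = f"ipc://{os.path.join(tempfile.gettempdir(), 'demo.socket')}"

ctx = zmq.Context()

# "Server" side: a PULL socket bound to the ipc path (mirrors ZmqIpcServer with mode=zmq.PULL).
server = ctx.socket(zmq.PULL)
server.bind(endpoint)

# "Client" side: a PUSH socket connected to the same path (mirrors ZmqIpcClient.connect()).
client = ctx.socket(zmq.PUSH)
client.connect(endpoint)

client.send_pyobj({"request_id": "req-0", "prompt": "hello"})  # send_pyobj pickles the object
print(server.recv_pyobj())                                     # -> {'request_id': 'req-0', 'prompt': 'hello'}

client.close()
server.close()
ctx.term()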
308 fastdeploy/inter_communicator/zmq_server.py Normal file
@@ -0,0 +1,308 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""

import os
import threading
import time
from abc import ABC, abstractmethod
from collections import defaultdict

import msgpack
import zmq

from fastdeploy import envs
from fastdeploy.utils import llm_logger


class ZmqServerBase(ABC):
    """
    ZmqServerBase
    """

    def __init__(self):
        self.cached_results = defaultdict(list)
        self.response_token_lock = threading.Lock()

    @abstractmethod
    def _create_socket(self):
        """Abstract method to create and return a ZeroMQ socket."""
        pass

    def _ensure_socket(self):
        """Ensure the socket is created before use."""
        if self.socket is None:
            self.socket = self._create_socket()

    def pack_aggregated_data(self, data):
        """
        Aggregate multiple responses into one and send them to the client.
        """
        result = data[0]
        if len(data) > 1:
            for response in data[1:]:
                result.add(response)
        result = msgpack.packb([result.to_dict()])
        return result

    def receive_json_once(self, block=False):
        """
        Receive a single message from the socket.
        """
        self._ensure_socket()
        if self.socket is None or self.socket.closed:
            return "zmp socket has closed", None
        try:
            flags = zmq.NOBLOCK if not block else 0
            return None, self.socket.recv_json(flags=flags)
        except zmq.Again:
            return None, None
        except Exception as e:
            self.close()
            llm_logger.warning(f"{e}")
            return str(e), None

    def receive_pyobj_once(self, block=False):
        """
        Receive a single message from the socket.
        """
        self._ensure_socket()
        if self.socket is None or self.socket.closed:
            return "zmp socket has closed", None
        try:
            flags = zmq.NOBLOCK if not block else 0
            return None, self.socket.recv_pyobj(flags=flags)
        except zmq.Again:
            return None, None
        except Exception as e:
            self.close()
            llm_logger.warning(f"{e}")
            return str(e), None

    def recv_result_handle(self):
        while True:
            try:
                with self.response_token_lock:
                    client, _, request_id = self.socket.recv_multipart(flags=zmq.NOBLOCK)
                req_id_str = request_id.decode("utf-8")
                need_send_after_finished_inference = False
                with self.mutex:
                    self.req_dict[req_id_str] = client
                    if req_id_str in self.cached_results:
                        if self.cached_results[req_id_str][-1][-1].finished:
                            need_send_after_finished_inference = True
                if need_send_after_finished_inference:
                    self.send_response(req_id_str, [])
                    llm_logger.info(f"send_multipart finished, req_id: {req_id_str}")
                    self.req_dict.pop(req_id_str, None)

            except zmq.Again:
                time.sleep(0.001)
                continue
            except Exception as e:
                llm_logger.error(f"recv_result_handle get unknown exception: {e}")
                continue

    def send_response(self, req_id, data):
        """
        Send generated token result to client.
        """
        self._ensure_socket()
        if self.socket is None:
            raise RuntimeError("Router socket not created. Call create_router() first.")
        new_data = []
        has_result_handle = False
        with self.mutex:
            if req_id not in self.req_dict:
                self.cached_results[req_id].append(data)
            else:
                has_result_handle = True
                if req_id in self.cached_results:
                    for history_data in self.cached_results[req_id]:
                        new_data.extend(history_data)
                    llm_logger.info(
                        f"get request {req_id} result handle after cached result, total cached length {len(self.cached_results[req_id])}"
                    )
                    del self.cached_results[req_id]
        if has_result_handle:
            try:
                new_data.extend(data)
                start_send = time.time()
                if self.aggregate_send:
                    result = self.pack_aggregated_data(new_data)
                else:
                    result = msgpack.packb([response.to_dict() for response in new_data])
                with self.response_token_lock:
                    self.socket.send_multipart([self.req_dict[req_id], b"", result])
                llm_logger.debug(
                    f"send_multipart result: {req_id} len {len(new_data)} elapse: {time.time()-start_send}"
                )

            except Exception as e:
                llm_logger.error(f"Send result to zmq client failed: {e}")

        if data and data[-1].finished:
            with self.mutex:
                if req_id in self.req_dict:
                    llm_logger.info(f"send_multipart finished, req_id: {req_id}")
                    self.req_dict.pop(req_id, None)


    @abstractmethod
    def close(self):
        pass

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.close()


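The response path in ZmqServerBase relies on a standard ZeroMQ ROUTER/DEALER handshake: the client first announces the request id it wants results for, the server records the ROUTER identity frame for that id (recv_result_handle), and later routes the msgpack-packed results back through that identity (send_response). Below is a small, self-contained sketch of the same framing; the endpoint, request id, and payload are illustrative, not the exact FastDeploy wire protocol.

import msgpack
import zmq

endpoint = "tcp://127.0.0.1:5571"  # illustrative endpoint
ctx = zmq.Context()

router = ctx.socket(zmq.ROUTER)    # server side, as in the ROUTER-mode server sockets above
router.bind(endpoint)

dealer = ctx.socket(zmq.DEALER)    # client side
dealer.connect(endpoint)

# 1. Client announces the request id it wants results for: [empty delimiter, req_id].
dealer.send_multipart([b"", b"req-42"])

# 2. Server learns which identity frame belongs to that req_id (cf. recv_result_handle).
identity, _, req_id = router.recv_multipart()

# 3. Server sends results back through the recorded identity (cf. send_response).
payload = msgpack.packb([{"req_id": req_id.decode(), "token": "hi", "finished": True}])
router.send_multipart([identity, b"", payload])

# 4. Client unpacks the reply.
_, raw = dealer.recv_multipart()
print(msgpack.unpackb(raw))

dealer.close()
router.close()
ctx.term()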
class ZmqIpcServer(ZmqServerBase):
    """
    ZmqIpcServer, used when FD_ENABLE_INTERNAL_ADAPTER=0
    """

    def __init__(self, name, mode):
        self.name = name
        self.mode = mode
        self.cached_results = defaultdict(list)
        if mode == zmq.PULL:
            self.file_name = f"/dev/shm/{name}.socket"
        elif mode == zmq.ROUTER:
            self.file_name = f"/dev/shm/router_{name}.ipc"
        self.ZMQ_SNDHWM = int(envs.FD_ZMQ_SNDHWM)
        self.aggregate_send = envs.FD_USE_AGGREGATE_SEND
        self.mutex = threading.Lock()
        self.response_token_lock = threading.Lock()
        self.req_dict = dict()
        self.running = True
        self.context = zmq.Context()
        self._create_socket()

    def _create_socket(self):
        """create and return a ZeroMQ socket."""
        self.socket = self.context.socket(self.mode)
        self.socket.setsockopt(zmq.SNDHWM, self.ZMQ_SNDHWM)
        self.socket.setsockopt(zmq.SNDTIMEO, -1)
        self.socket.bind(f"ipc://{self.file_name}")
        return self.socket

    def _clear_ipc(self, name):
        """
        Remove the IPC file with the given name.
        """
        if os.path.exists(name):
            try:
                os.remove(name)
            except OSError as e:
                llm_logger.warning(f"Failed to remove IPC file {name} - {e}")

    def close(self):
        """
        Close the socket and context, and remove the IPC files.
        """
        if not self.running:
            return

        self.running = False
        llm_logger.info("ZMQ server is closing connection...")
        try:
            if self.socket is not None and not self.socket.closed:
                self.socket.close()
            if not self.context.closed:
                self.context.term()
            self._clear_ipc(self.file_name)
        except Exception as e:
            llm_logger.warning(f"ZMQ server failed to close connection - {e}")
            return


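ZmqIpcServer is normally polled rather than blocked on: receive_json_once / receive_pyobj_once pass zmq.NOBLOCK and treat zmq.Again as "nothing available yet", and the caller sleeps briefly before retrying. A minimal sketch of that polling idiom with plain pyzmq follows; the endpoint, the receive_once helper, and the 1 ms back-off are illustrative stand-ins, not FastDeploy code.

import time

import zmq

endpoint = "ipc:///tmp/demo_pull.socket"  # illustrative path; FastDeploy binds under /dev/shm
ctx = zmq.Context()
pull = ctx.socket(zmq.PULL)
pull.bind(endpoint)
push = ctx.socket(zmq.PUSH)
push.connect(endpoint)
push.send_pyobj({"task": "ping"})

def receive_once(sock, block=False):
    """Return (error, message); (None, None) means nothing available yet."""
    try:
        return None, sock.recv_pyobj(flags=0 if block else zmq.NOBLOCK)
    except zmq.Again:
        return None, None
    except Exception as e:  # socket closed, interrupted, ...
        return str(e), None

msg = None
while msg is None:            # poll until the message arrives
    err, msg = receive_once(pull)
    if err:
        raise RuntimeError(err)
    if msg is None:
        time.sleep(0.001)     # short back-off between polls
print("got:", msg)

push.close()
pull.close()
ctx.term()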
class ZmqTcpServer(ZmqServerBase):
    """
    ZmqTcpServer, used when FD_ENABLE_INTERNAL_ADAPTER=1
    """

    def __init__(self, port, mode):
        self.mode = mode
        self.port = port
        self.cached_results = defaultdict(list)
        self.ZMQ_SNDHWM = int(envs.FD_ZMQ_SNDHWM)
        self.aggregate_send = envs.FD_USE_AGGREGATE_SEND

        self.mutex = threading.Lock()
        self.req_dict = dict()
        self.running = True
        self.context = zmq.Context()
        self._create_socket()
        self.response_token_lock = threading.Lock()

    def _create_socket(self):
        """create and return a ZeroMQ socket."""
        self.socket = self.context.socket(self.mode)
        self.socket.setsockopt(zmq.SNDHWM, self.ZMQ_SNDHWM)
        self.socket.setsockopt(zmq.SNDTIMEO, -1)
        self.socket.bind(f"tcp://*:{self.port}")
        return self.socket

    def recv_control_cmd(self):
        """
        Receive control command from client
        """
        self._ensure_socket()
        try:
            client, _, task_data = self.socket.recv_multipart(flags=zmq.NOBLOCK)
            task = msgpack.unpackb(task_data)
            task_id_str = task["task_id"]
        except zmq.Again:
            return None
        with self.mutex:
            self.req_dict[task_id_str] = client
        return task

    def response_for_control_cmd(self, task_id, result):
        """
        Send command result back to client.
        """
        self._ensure_socket()
        if self.socket is None:
            raise RuntimeError("Router socket not created.")
        try:
            result = msgpack.packb(result)
            self.socket.send_multipart([self.req_dict[task_id], b"", result])

        except Exception as e:
            llm_logger.error(f"Send result to zmq client failed: {e}")

        with self.mutex:
            self.req_dict.pop(task_id, None)
        llm_logger.debug(f"response control cmd finished, task_id: {task_id}")

    def close(self):
        """
        Close the socket and context.
        """
        if not self.running:
            return

        self.running = False
        llm_logger.info("ZMQ server is closing connection...")
        try:
            if self.socket is not None and not self.socket.closed:
                self.socket.close()
            if not self.context.closed:
                self.context.term()

        except Exception as e:
            llm_logger.warning(f"ZMQ server failed to close connection - {e}")
            return
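Both server classes share the FD_USE_AGGREGATE_SEND switch: with aggregation off, every response in a batch is packed as its own dict; with it on, pack_aggregated_data folds the whole batch into the first response before packing a single dict. The sketch below shows the two resulting wire formats using a stand-in Response class; the real response type and its add()/to_dict() methods live elsewhere in FastDeploy and are only imitated here.

import msgpack

class Response:  # hypothetical stand-in for the real FastDeploy response object
    def __init__(self, req_id, tokens, finished=False):
        self.req_id, self.tokens, self.finished = req_id, list(tokens), finished

    def add(self, other):            # merge another chunk into this one
        self.tokens.extend(other.tokens)
        self.finished = self.finished or other.finished

    def to_dict(self):
        return {"req_id": self.req_id, "tokens": self.tokens, "finished": self.finished}

batch = [Response("req-7", [1, 2]), Response("req-7", [3], finished=True)]

# aggregate_send == False: one dict per response on the wire.
plain = msgpack.packb([r.to_dict() for r in batch])

# aggregate_send == True: fold the batch into the first response, pack a single dict.
merged = batch[0]
for r in batch[1:]:
    merged.add(r)
aggregated = msgpack.packb([merged.to_dict()])

print(msgpack.unpackb(plain))       # two dicts for req-7
print(msgpack.unpackb(aggregated))  # one dict: tokens [1, 2, 3], finished True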
@@ -154,6 +154,22 @@ class MetricsManager:
    spec_decode_num_emitted_tokens_total: "Counter"
    spec_decode_draft_single_head_acceptance_rate: "list[Gauge]"

    # for YIYAN Adapter
    prefix_cache_token_num: "Gauge"
    prefix_gpu_cache_token_num: "Gauge"
    prefix_cpu_cache_token_num: "Gauge"
    prefix_ssd_cache_token_num: "Gauge"
    batch_size: "Gauge"
    max_batch_size: "Gauge"
    available_gpu_block_num: "Gauge"
    free_gpu_block_num: "Gauge"
    max_gpu_block_num: "Gauge"
    available_gpu_resource: "Gauge"
    requests_number: "Counter"
    send_cache_failed_num: "Counter"
    first_token_latency: "Gauge"
    infer_latency: "Gauge"

    # Define all metric configurations
    METRICS = {
        "num_requests_running": {
@@ -258,6 +274,91 @@ class MetricsManager:
            "description": "Total number of successfully processed requests",
            "kwargs": {},
        },
        # for YIYAN Adapter
        "prefix_cache_token_num": {
            "type": Counter,
            "name": "fastdeploy:prefix_cache_token_num",
            "description": "Total number of cached tokens",
            "kwargs": {},
        },
        "prefix_gpu_cache_token_num": {
            "type": Counter,
            "name": "fastdeploy:prefix_gpu_cache_token_num",
            "description": "Total number of cached tokens on GPU",
            "kwargs": {},
        },
        "prefix_cpu_cache_token_num": {
            "type": Counter,
            "name": "fastdeploy:prefix_cpu_cache_token_num",
            "description": "Total number of cached tokens on CPU",
            "kwargs": {},
        },
        "prefix_ssd_cache_token_num": {
            "type": Counter,
            "name": "fastdeploy:prefix_ssd_cache_token_num",
            "description": "Total number of cached tokens on SSD",
            "kwargs": {},
        },
        "batch_size": {
            "type": Gauge,
            "name": "fastdeploy:batch_size",
            "description": "Real batch size during inference",
            "kwargs": {},
        },
        "max_batch_size": {
            "type": Gauge,
            "name": "fastdeploy:max_batch_size",
            "description": "Maximum batch size determined when service started",
            "kwargs": {},
        },
        "available_gpu_block_num": {
            "type": Gauge,
            "name": "fastdeploy:available_gpu_block_num",
            "description": "Number of available gpu blocks in cache, including prefix caching blocks that are not officially released",
            "kwargs": {},
        },
        "free_gpu_block_num": {
            "type": Gauge,
            "name": "fastdeploy:free_gpu_block_num",
            "description": "Number of free blocks in cache",
            "kwargs": {},
        },
        "max_gpu_block_num": {
            "type": Gauge,
            "name": "fastdeploy:max_gpu_block_num",
            "description": "Number of total blocks determined when service started",
            "kwargs": {},
        },
        "available_gpu_resource": {
            "type": Gauge,
            "name": "fastdeploy:available_gpu_resource",
            "description": "Available blocks percentage, i.e. available_gpu_block_num / max_gpu_block_num",
            "kwargs": {},
        },
        "requests_number": {
            "type": Counter,
            "name": "fastdeploy:requests_number",
            "description": "Total number of requests received",
            "kwargs": {},
        },
        "send_cache_failed_num": {
            "type": Counter,
            "name": "fastdeploy:send_cache_failed_num",
            "description": "Total number of failures of sending cache",
            "kwargs": {},
        },
        "first_token_latency": {
            "type": Gauge,
            "name": "fastdeploy:first_token_latency",
            "description": "Latest time to first token in seconds",
            "kwargs": {},
        },
        "infer_latency": {
            "type": Gauge,
            "name": "fastdeploy:infer_latency",
            "description": "Latest time to generate one token in seconds",
            "kwargs": {},
        },
    }
    SPECULATIVE_METRICS = {}

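This hunk only adds declarations to the metric table; how MetricsManager turns the table into live collectors is outside the diff shown here. A minimal sketch of the obvious instantiation loop with prometheus_client follows, assuming each entry carries a type, name, description, and extra kwargs as above; MetricsManagerSketch and the port number are illustrative, not the FastDeploy implementation.

from prometheus_client import Counter, Gauge, start_http_server

METRICS = {
    "batch_size": {
        "type": Gauge,
        "name": "fastdeploy:batch_size",
        "description": "Real batch size during inference",
        "kwargs": {},
    },
    "requests_number": {
        "type": Counter,
        "name": "fastdeploy:requests_number",
        "description": "Total number of requests received",
        "kwargs": {},
    },
}

class MetricsManagerSketch:
    def __init__(self):
        # Create one collector per table entry and expose it as an attribute.
        for attr, cfg in METRICS.items():
            setattr(self, attr, cfg["type"](cfg["name"], cfg["description"], **cfg["kwargs"]))

manager = MetricsManagerSketch()
start_http_server(8000)        # expose /metrics on an arbitrary port
manager.batch_size.set(4)      # Gauge: record the current value
manager.requests_number.inc()  # Counter: increment on each request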
Some files were not shown because too many files have changed in this diff.