Mirror of https://github.com/PaddlePaddle/FastDeploy.git (synced 2025-12-24 13:28:13 +08:00)
[INTEL_HPU] [CI] enabled fastdeploy PR testing (#4596)
* [INTEL HPU] added HPU CI workflow support
* [INTEL HPU] added run CI HPU test scripts
* [INTEL HPU] enabled HPU ernie test case
* [INTEL HPU] updated Intel Gaudi README with warmup-disable command line
* Modify paddlepaddle installation command: use a specific index URL.
* Update run_ci_hpu.sh
* Rename extracted json directory to nlohmann_json
* Update ci_hpu.yml
* Set pip global index URL to Tsinghua mirror
* Update CI workflow to use self-hosted runner and paths
* Update Docker image in CI workflow
* Modify HPU installation URLs in run_ci_hpu.sh: updated the installation URL for paddle_intel_hpu and added paddlenlp_ops installation.
* Fix paddle_intel_hpu installation URL

Signed-off-by: Luo, Focus <focus.luo@intel.com>
Co-authored-by: plusNew001 <95567040+plusNew001@users.noreply.github.com>
.github/workflows/ci_hpu.yml (new file, 85 lines, vendored)
@@ -0,0 +1,85 @@
name: CI_HPU

on:
  pull_request:
    branches:
      - develop
      - 'release/*'
  workflow_dispatch:

concurrency:
  group: ${{ github.event.pull_request.number }}-hpu-ci
  cancel-in-progress: true

jobs:
  CI_HPU:
    runs-on: [self-hosted, HPU-8Card]
    steps:
      - name: Print current runner name
        run: |
          echo "Current runner name: ${{ runner.name }}"

      - name: Code Checkout
        env:
          docker_image: vault.habana.ai/gaudi-docker/1.22.0/ubuntu22.04/habanalabs/pytorch-installer-2.7.1:latest
        run: |
          REPO="https://github.com/${{ github.repository }}.git"
          FULL_REPO="${{ github.repository }}"
          REPO_NAME="${FULL_REPO##*/}"
          BASE_BRANCH="${{ github.base_ref }}"
          # Clean the repository directory before starting
          docker run --rm --net=host -v $(pwd):/workspace -w /workspace \
            -e "REPO_NAME=${REPO_NAME}" \
            -e "BASE_BRANCH=${BASE_BRANCH}" \
            ${docker_image} /bin/bash -c '
              if [ -d ${REPO_NAME} ]; then
                echo "Directory ${REPO_NAME} exists, removing it..."
                rm -rf ${REPO_NAME}
              fi
            '
          git config --global user.name "FastDeployCI"
          git config --global user.email "fastdeploy_ci@example.com"
          git clone ${REPO} ${REPO_NAME} -b ${BASE_BRANCH}
          cd FastDeploy
          # For PRs, merge the PR head into the base branch so the merged state is tested
          if [ "${{ github.event_name }}" = "pull_request" ]; then
            git fetch origin pull/${{ github.event.pull_request.number }}/head:pr/${{ github.event.pull_request.number }}
            git merge pr/${{ github.event.pull_request.number }}
            git log -n 3 --oneline
          else
            git checkout ${{ github.sha }}
            git log -n 3 --oneline
          fi

      - name: Run CI unittest
        env:
          docker_image: vault.habana.ai/gaudi-docker/1.22.0/ubuntu22.04/habanalabs/pytorch-installer-2.7.1:latest
        run: |
          runner_name="${{ runner.name }}"
          last_char="${runner_name: -1}"

          # Pick an HPU id from the runner name's trailing digit (fall back to 0)
          if [[ "$last_char" =~ [0-3] ]]; then
            hpu_id="$last_char"
          else
            hpu_id="0"
          fi
          FD_API_PORT=8388
          FD_ENGINE_QUEUE_PORT=8902
          FD_METRICS_PORT=8202

          PARENT_DIR=$(dirname "$WORKSPACE")
          echo "PARENT_DIR:$PARENT_DIR"
          docker run --rm --net=host --cap-add=SYS_PTRACE --privileged --shm-size=64G \
            -v $(pwd):/workspace -w /workspace \
            -v "/ssd1:/ssd1" \
            -e "MODEL_PATH=/ssd1" \
            -e "http_proxy=$(git config --global --get http.proxy)" \
            -e "https_proxy=$(git config --global --get https.proxy)" \
            -e "no_proxy=bcebos.com,mirrors.tuna.tsinghua.edu.cn,localhost,127.0.0.1,0.0.0.0,10.0.0.0/8,192.168.1.0/24" \
            -e "FD_API_PORT=${FD_API_PORT}" \
            -e "FD_ENGINE_QUEUE_PORT=${FD_ENGINE_QUEUE_PORT}" \
            -e "FD_METRICS_PORT=${FD_METRICS_PORT}" \
            ${docker_image} /bin/bash -c "
              git config --global --add safe.directory /workspace/FastDeploy
              cd FastDeploy
              bash scripts/run_ci_hpu.sh
            "
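Besides pull requests, the `workflow_dispatch` trigger above allows manual runs. A minimal sketch using the GitHub CLI — assuming `gh` is installed and authenticated with access to the repository:

```
# Queue a manual run of the CI_HPU workflow on the develop branch
gh workflow run ci_hpu.yml --repo PaddlePaddle/FastDeploy --ref develop
```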
Intel Gaudi README (English):
@@ -57,7 +57,11 @@ export PADDLE_XCCL_BACKEND=intel_hpu
export HABANA_PROFILE=0
export HPU_VISIBLE_DEVICES=0

# Warmup enabled
HPU_WARMUP_BUCKET=1 HPU_WARMUP_MODEL_LEN=4096 FD_ATTENTION_BACKEND=HPU_ATTN python -m fastdeploy.entrypoints.openai.api_server --model ERNIE-4.5-21B-A3B-Paddle --tensor-parallel-size 1 --max-model-len 32768 --max-num-seqs 128

# Warmup disabled
HPU_WARMUP_BUCKET=0 HPU_WARMUP_MODEL_LEN=4096 FD_ATTENTION_BACKEND=HPU_ATTN python -m fastdeploy.entrypoints.openai.api_server --model ERNIE-4.5-21B-A3B-Paddle --tensor-parallel-size 1 --max-model-len 32768 --max-num-seqs 128 --graph-optimization-config '{"use_cudagraph":false}'
```

### 2. Send a request
Intel Gaudi README (Chinese, same change):
@@ -57,7 +57,11 @@ export PADDLE_XCCL_BACKEND=intel_hpu
export HABANA_PROFILE=0
export HPU_VISIBLE_DEVICES=0

# Warmup enabled
HPU_WARMUP_BUCKET=1 HPU_WARMUP_MODEL_LEN=4096 FD_ATTENTION_BACKEND=HPU_ATTN python -m fastdeploy.entrypoints.openai.api_server --model ERNIE-4.5-21B-A3B-Paddle --tensor-parallel-size 1 --max-model-len 32768 --max-num-seqs 128

# Warmup disabled
HPU_WARMUP_BUCKET=0 HPU_WARMUP_MODEL_LEN=4096 FD_ATTENTION_BACKEND=HPU_ATTN python -m fastdeploy.entrypoints.openai.api_server --model ERNIE-4.5-21B-A3B-Paddle --tensor-parallel-size 1 --max-model-len 32768 --max-num-seqs 128 --graph-optimization-config '{"use_cudagraph":false}'
```

### 2. Send a request
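Once the server is up, the OpenAI-compatible API can be exercised directly. A minimal sketch of such a request — the port is an assumption, shown here as the FD_API_PORT value (8388) used by the CI scripts in this commit; substitute the port your server actually listens on:

```
# Chat completion against the OpenAI-compatible endpoint
curl -s http://0.0.0.0:8388/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
        "model": "default",
        "messages": [{"role": "user", "content": "The largest ocean is"}],
        "max_tokens": 64
      }'
```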
scripts/run_ci_hpu.sh (new executable file, 154 lines)
@@ -0,0 +1,154 @@
#!/bin/bash
DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
echo "$DIR"

# Install dependencies
apt install -y lsof

export FD_API_PORT=8388
export FD_ENGINE_QUEUE_PORT=8902
export FD_METRICS_PORT=8202

# Release resources held over from any previous run
ps -efww | grep -E 'api_server' | grep -v grep | awk '{print $2}' | xargs kill -9 || true
ps -efww | grep -E $FD_API_PORT | grep -v grep | awk '{print $2}' | xargs kill -9 || true
lsof -t -i :$FD_API_PORT | xargs kill -9 || true

echo "pip requirements"
python -m pip install -r requirements.txt

echo "uninstall original packages"
# Uninstall PaddleCustomDevice (paddle-intel-hpu)
python -m pip uninstall paddle-intel-hpu -y
# Uninstall fastdeploy
python -m pip uninstall fastdeploy_intel_hpu -y
python -m pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple
# Install paddlepaddle
pip install paddlepaddle==3.1.1 -i https://www.paddlepaddle.org.cn/packages/stable/cpu/
# Install PaddleCustomDevice (paddle-intel-hpu) and the custom ops wheel
pip install https://paddle-qa.bj.bcebos.com/suijiaxin/HPU/paddle_intel_hpu-0.0.1-cp310-cp310-linux_x86_64.whl
pip install https://paddle-qa.bj.bcebos.com/suijiaxin/HPU/paddlenlp_ops-0.0.0-cp310-cp310-linux_x86_64.whl

# Build and install fastdeploy
echo "build whl"
wget -q https://paddle-qa.bj.bcebos.com/suijiaxin/HPU/third-party/DeepGEMM.tar.gz && tar -xzf DeepGEMM.tar.gz -C custom_ops/third_party/
wget -q https://paddle-qa.bj.bcebos.com/suijiaxin/HPU/third-party/cutlass.tar.gz && tar -xzf cutlass.tar.gz -C custom_ops/third_party/
wget -q https://paddle-qa.bj.bcebos.com/suijiaxin/HPU/third-party/json.tar.gz && tar -xzf json.tar.gz -C custom_ops/third_party/ && mv custom_ops/third_party/json custom_ops/third_party/nlohmann_json
chmod +x build.sh
bash build.sh || exit 1
pip install dist/fastdeploy_intel_hpu-2.3.0.dev0-py3-none-any.whl --force-reinstall
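After these installs, a quick sanity check that the intel_hpu plugin actually registered with Paddle can save debugging time later — a minimal sketch, hedged since the exact output depends on the installed plugin version:

```
# Expect 'intel_hpu' to appear among the registered custom device types
python -c "import paddle; print(paddle.device.get_all_custom_device_type())"
```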

# Install test dependencies
echo "pip others"
pip install numpy requests tqdm ddt gradio aistudio-sdk pytest

# Prepare for serving
rm -rf log/*
rm -f server.log
# Clear the message queue
ipcrm --all=msg

# Start the server
export GC_KERNEL_PATH=/usr/lib/habanalabs/libtpc_kernels.so
export GC_KERNEL_PATH=/usr/local/lib/python3.10/dist-packages/paddle_custom_device/intel_hpu/libcustom_tpc_perf_lib.so:$GC_KERNEL_PATH
export PADDLE_DISTRI_BACKEND=xccl
export PADDLE_XCCL_BACKEND=intel_hpu
export FLAGS_intel_hpu_recipe_cache_num=20480
export HABANA_PROFILE=0

# Do not go through a proxy
unset http_proxy
unset https_proxy
unset no_proxy

echo "MODEL_PATH=${MODEL_PATH}"
|
||||
#currently Fastdepoly PR testing is working together with PaddleCostomDevice PR testing on a same Intel HPUs Machine
|
||||
#ERNIE-4.5-300B-A47B-Paddl will use all HPUS (8HPUs) and will block PaddleCostomDevice PR testing
|
||||
#so let us to use ERNIE-4.5-21B-A3B-Paddle firstly, which only needs 1 HPU
|
||||
FD_ATTENTION_BACKEND_NAME="HPU_ATTN"
|
||||
#ERNIE-4.5-300B-A47B-Paddle (300B)
|
||||
ENABLE_TESTING_ERNIE45_300B_A47B_Paddle=0
|
||||
if [ $ENABLE_TESTING_ERNIE45_300B_A47B_Paddle -eq 1 ]; then
|
||||
export model_path=${MODEL_PATH}/ERNIE-4.5-300B-A47B-Paddle
|
||||
export HPU_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
|
||||
echo "CMD Line: HPU_PERF_BREAKDOWN_SYNC_MODE=1 HPU_WARMUP_BUCKET=0 HPU_WARMUP_MODEL_LEN=3072 FD_ATTENTION_BACKEND=$FD_ATTENTION_BACKEND_NAME python -m fastdeploy.entrypoints.openai.api_server --model $model_path --port $FD_API_PORT --engine-worker-queue-port $FD_ENGINE_QUEUE_PORT --metrics-port $FD_METRICS_PORT --kv-cache-ratio 0.98 --num-gpu-blocks-override 3200 --tensor-parallel-size 8 --max-model-len 32786 --max-num-seqs 128 --block-size 128 --graph-optimization-config '{"use_cudagraph":false}' > server.log 2>&1 &"
|
||||
HPU_PERF_BREAKDOWN_SYNC_MODE=1 HPU_WARMUP_BUCKET=0 HPU_WARMUP_MODEL_LEN=3072 FD_ATTENTION_BACKEND=$FD_ATTENTION_BACKEND_NAME python -m fastdeploy.entrypoints.openai.api_server --model $model_path --port $FD_API_PORT --engine-worker-queue-port $FD_ENGINE_QUEUE_PORT --metrics-port $FD_METRICS_PORT --kv-cache-ratio 0.98 --num-gpu-blocks-override 3200 --tensor-parallel-size 8 --max-model-len 32786 --max-num-seqs 128 --block-size 128 --graph-optimization-config '{"use_cudagraph":false}' > server.log 2>&1 &
|
||||
fi
|
||||
|
||||
#ERNIE-4.5-21B-A3B-Paddle (21B)
|
||||
ENABLE_TESTING_ERNIE45_21B_A3B_Paddle=1
|
||||
if [ $ENABLE_TESTING_ERNIE45_21B_A3B_Paddle -eq 1 ]; then
|
||||
export model_path=${MODEL_PATH}/ERNIE-4.5-21B-A3B-Paddle/
|
||||
export HPU_VISIBLE_DEVICES=3
|
||||
echo "CMD Line: HPU_PERF_BREAKDOWN_SYNC_MODE=1 HPU_WARMUP_BUCKET=0 HPU_WARMUP_MODEL_LEN=4096 FD_ATTENTION_BACKEND=$FD_ATTENTION_BACKEND_NAME python -m fastdeploy.entrypoints.openai.api_server --model $model_path --port $FD_API_PORT --engine-worker-queue-port $FD_ENGINE_QUEUE_PORT --metrics-port $FD_METRICS_PORT --tensor-parallel-size 1 --max-model-len 32786 --max-num-seqs 128 --block-size 128 --graph-optimization-config '{"use_cudagraph":false}' > server.log 2>&1 &"
|
||||
HPU_PERF_BREAKDOWN_SYNC_MODE=1 HPU_WARMUP_BUCKET=0 HPU_WARMUP_MODEL_LEN=4096 FD_ATTENTION_BACKEND=$FD_ATTENTION_BACKEND_NAME python -m fastdeploy.entrypoints.openai.api_server --model $model_path --port $FD_API_PORT --engine-worker-queue-port $FD_ENGINE_QUEUE_PORT --metrics-port $FD_METRICS_PORT --tensor-parallel-size 1 --max-model-len 32786 --max-num-seqs 128 --block-size 128 --graph-optimization-config '{"use_cudagraph":false}' > server.log 2>&1 &
|
||||
fi
|
||||
|
||||
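Both launch commands run the server in the background, so startup progress is easiest to follow from the logs the script already collects — for example:

```
# Follow the API server log and the worker log written by the 21B config
tail -f server.log log/workerlog.0
```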
sleep 60
# Check that serving became active
TIMEOUT=$((60 * 60)) # 60 min
INTERVAL=10          # check every 10 s
ENDPOINT="http://0.0.0.0:$FD_API_PORT/health"
START_TIME=$(date +%s) # start time
echo "Start checking the serving status; will wait up to ${TIMEOUT} seconds"
while true; do
    # Elapsed time so far
    CURRENT_TIME=$(date +%s)
    ELAPSED=$((CURRENT_TIME - START_TIME))

    # Check for timeout
    if [ $ELAPSED -ge $TIMEOUT ]; then
        echo -e "\nserving failed to start within $((TIMEOUT/60)) minutes"
        cat server.log
        # ERNIE-4.5-21B-A3B-Paddle only has workerlog.0
        cat log/workerlog.0
        # ERNIE-4.5-300B-A47B-Paddle (300B) writes 8 worker logs
        if [ $ENABLE_TESTING_ERNIE45_300B_A47B_Paddle -eq 1 ]; then
            for i in $(seq 1 7); do cat log/workerlog.$i; done
        fi
        exit 1
    fi

    HTTP_CODE=$(curl -s -o /dev/null -w "%{http_code}" -m 2 "$ENDPOINT" || true)

    if [ "$HTTP_CODE" = "200" ]; then
        echo -e "\nserving started successfully! It took ${ELAPSED} seconds"
        break
    else
        echo -e "$(date +%F_%H:%M:%S) checking serving start status......"
        sleep $INTERVAL
    fi
done

cat server.log

# Run the serving inference tests
echo "Start inference testing..."
python -m pytest tests/ci_use/HPU/run_ernie.py
exit_code=$?
echo "exit_code is ${exit_code}"

ps -efww | grep -E 'api_server' | grep -v grep | awk '{print $2}' | xargs kill -9 || true
ps -efww | grep -E $FD_API_PORT | grep -v grep | awk '{print $2}' | xargs kill -9 || true
lsof -t -i :$FD_API_PORT | xargs kill -9 || true

if [ ${exit_code} -ne 0 ]; then
    echo "log/workerlog.0"
    cat log/workerlog.0
    echo "model testing failed, please check your PR source code"
    exit 1
fi

sleep 5
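To reproduce the CI run outside the workflow — a sketch under the assumption of a Gaudi machine with the Habana stack installed and model weights available locally (the workflow above mounts them at /ssd1):

```
# Run from the FastDeploy repo root; MODEL_PATH must contain ERNIE-4.5-21B-A3B-Paddle
MODEL_PATH=/ssd1 bash scripts/run_ci_hpu.sh
```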
tests/ci_use/HPU/run_ernie.py (new file, 46 lines)
@@ -0,0 +1,46 @@
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os

import openai


def test_hpu():
    ip = "0.0.0.0"
    service_http_port = os.getenv("FD_API_PORT", "8388")  # service port
    client = openai.Client(base_url=f"http://{ip}:{service_http_port}/v1", api_key="EMPTY_API_KEY")

    # Chat completion; top_p=0 keeps decoding near-greedy so the answer is stable
    response = client.chat.completions.create(
        model="default",
        messages=[
            {"role": "user", "content": "The largest ocean is"},
        ],
        temperature=1,
        top_p=0,
        max_tokens=64,
        stream=False,
    )
    print(f"response is: {response}", flush=True)

    generate_context = response.choices[0].message.content
    print(f"\ngenerate_context is: {generate_context}", flush=True)

    assert "pacific ocean" in generate_context.lower(), "The answer was incorrect!"
    print("Test passed!", flush=True)


if __name__ == "__main__":
    test_hpu()
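The test can also be run by hand against an already-running server — FD_API_PORT must match the port the server was launched with (8388 in these CI scripts):

```
FD_API_PORT=8388 python -m pytest tests/ci_use/HPU/run_ernie.py -s
```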