diff --git a/.github/workflows/ci_hpu.yml b/.github/workflows/ci_hpu.yml
new file mode 100644
index 000000000..039acd909
--- /dev/null
+++ b/.github/workflows/ci_hpu.yml
@@ -0,0 +1,85 @@
+name: CI_HPU
+
+on:
+  pull_request:
+    branches:
+      - develop
+      - 'release/*'
+  workflow_dispatch:
+
+concurrency:
+  group: ${{ github.event.pull_request.number }}-hpu-ci
+  cancel-in-progress: true
+
+jobs:
+  CI_HPU:
+    runs-on: [self-hosted, HPU-8Card]
+    steps:
+      - name: Print current runner name
+        run: |
+          echo "Current runner name: ${{ runner.name }}"
+
+      - name: Code Checkout
+        env:
+          docker_image: vault.habana.ai/gaudi-docker/1.22.0/ubuntu22.04/habanalabs/pytorch-installer-2.7.1:latest
+        run: |
+          REPO="https://github.com/${{ github.repository }}.git"
+          FULL_REPO="${{ github.repository }}"
+          REPO_NAME="${FULL_REPO##*/}"
+          BASE_BRANCH="${{ github.base_ref }}"
+          # Clean the repository directory before starting
+          docker run --rm --net=host -v $(pwd):/workspace -w /workspace \
+            -e "REPO_NAME=${REPO_NAME}" \
+            -e "BASE_BRANCH=${BASE_BRANCH}" \
+            ${docker_image} /bin/bash -c '
+            if [ -d ${REPO_NAME} ]; then
+              echo "Directory ${REPO_NAME} exists, removing it..."
+              rm -rf ${REPO_NAME}
+            fi
+            '
+          git config --global user.name "FastDeployCI"
+          git config --global user.email "fastdeploy_ci@example.com"
+          git clone ${REPO} ${REPO_NAME} -b ${BASE_BRANCH}
+          cd FastDeploy
+          if [ "${{ github.event_name }}" = "pull_request" ]; then
+            git fetch origin pull/${{ github.event.pull_request.number }}/head:pr/${{ github.event.pull_request.number }}
+            git merge pr/${{ github.event.pull_request.number }}
+            git log -n 3 --oneline
+          else
+            git checkout ${{ github.sha }}
+            git log -n 3 --oneline
+          fi
+
+      - name: Run CI unittest
+        env:
+          docker_image: vault.habana.ai/gaudi-docker/1.22.0/ubuntu22.04/habanalabs/pytorch-installer-2.7.1:latest
+        run: |
+          runner_name="${{ runner.name }}"
+          last_char="${runner_name: -1}"
+
+          if [[ "$last_char" =~ [0-3] ]]; then
+            hpu_id="$last_char"
+          else
+            hpu_id="0"
+          fi
+          FD_API_PORT=8388
+          FD_ENGINE_QUEUE_PORT=8902
+          FD_METRICS_PORT=8202
+
+          PARENT_DIR=$(dirname "$WORKSPACE")
+          echo "PARENT_DIR:$PARENT_DIR"
+          docker run --rm --net=host --cap-add=SYS_PTRACE --privileged --shm-size=64G \
+            -v $(pwd):/workspace -w /workspace \
+            -v "/ssd1:/ssd1" \
+            -e "MODEL_PATH=/ssd1" \
+            -e "http_proxy=$(git config --global --get http.proxy)" \
+            -e "https_proxy=$(git config --global --get https.proxy)" \
+            -e "no_proxy=bcebos.com,mirrors.tuna.tsinghua.edu.cn,localhost,127.0.0.1,0.0.0.0,10.0.0.0/8,192.168.1.0/24" \
+            -e "FD_API_PORT=${FD_API_PORT}" \
+            -e "FD_ENGINE_QUEUE_PORT=${FD_ENGINE_QUEUE_PORT}" \
+            -e "FD_METRICS_PORT=${FD_METRICS_PORT}" \
+            ${docker_image} /bin/bash -c "
+            git config --global --add safe.directory /workspace/FastDeploy
+            cd FastDeploy
+            bash scripts/run_ci_hpu.sh
+            "
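For reference, the `Run CI unittest` step above derives an HPU slot index from the trailing character of the self-hosted runner name and falls back to `0` when that character is not a digit in `0-3`. A minimal sketch of the same parsing, using a hypothetical runner name:

```bash
#!/bin/bash
# Hypothetical runner name, for illustration only.
runner_name="HPU-8Card-2"
last_char="${runner_name: -1}"       # trailing character of the runner name
if [[ "$last_char" =~ [0-3] ]]; then
    hpu_id="$last_char"              # digits 0-3 select an HPU slot directly
else
    hpu_id="0"                       # anything else falls back to HPU 0
fi
echo "hpu_id=${hpu_id}"              # prints "hpu_id=2" for this runner name
```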
diff --git a/docs/get_started/installation/intel_gaudi.md b/docs/get_started/installation/intel_gaudi.md
index 3696e5d46..b27894b12 100644
--- a/docs/get_started/installation/intel_gaudi.md
+++ b/docs/get_started/installation/intel_gaudi.md
@@ -57,7 +57,11 @@
 export PADDLE_XCCL_BACKEND=intel_hpu
 export HABANA_PROFILE=0
 export HPU_VISIBLE_DEVICES=0
+#WARMUP Enabled
 HPU_WARMUP_BUCKET=1 HPU_WARMUP_MODEL_LEN=4096 FD_ATTENTION_BACKEND=HPU_ATTN python -m fastdeploy.entrypoints.openai.api_server --model ERNIE-4.5-21B-A3B-Paddle --tensor-parallel-size 1 --max-model-len 32768 --max-num-seqs 128
+
+#WARMUP Disabled
+HPU_WARMUP_BUCKET=0 HPU_WARMUP_MODEL_LEN=4096 FD_ATTENTION_BACKEND=HPU_ATTN python -m fastdeploy.entrypoints.openai.api_server --model ERNIE-4.5-21B-A3B-Paddle --tensor-parallel-size 1 --max-model-len 32768 --max-num-seqs 128 --graph-optimization-config '{"use_cudagraph":false}'
 ```
 
 ### 2. Launch the request
diff --git a/docs/zh/get_started/installation/intel_gaudi.md b/docs/zh/get_started/installation/intel_gaudi.md
index e769ee814..2af2939fd 100644
--- a/docs/zh/get_started/installation/intel_gaudi.md
+++ b/docs/zh/get_started/installation/intel_gaudi.md
@@ -57,7 +57,11 @@
 export PADDLE_XCCL_BACKEND=intel_hpu
 export HABANA_PROFILE=0
 export HPU_VISIBLE_DEVICES=0
+#WARMUP Enabled
 HPU_WARMUP_BUCKET=1 HPU_WARMUP_MODEL_LEN=4096 FD_ATTENTION_BACKEND=HPU_ATTN python -m fastdeploy.entrypoints.openai.api_server --model ERNIE-4.5-21B-A3B-Paddle --tensor-parallel-size 1 --max-model-len 32768 --max-num-seqs 128
+
+#WARMUP Disabled
+HPU_WARMUP_BUCKET=0 HPU_WARMUP_MODEL_LEN=4096 FD_ATTENTION_BACKEND=HPU_ATTN python -m fastdeploy.entrypoints.openai.api_server --model ERNIE-4.5-21B-A3B-Paddle --tensor-parallel-size 1 --max-model-len 32768 --max-num-seqs 128 --graph-optimization-config '{"use_cudagraph":false}'
 ```
 
 ### 2. 发送请求
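Both documents continue with a "Launch the request" section after the commands above. As a quick sanity check, a request along the lines below exercises the OpenAI-compatible chat endpoint; the port is an assumption matching the CI configuration (`FD_API_PORT=8388`), so adjust it to whatever `--port` your server actually uses:

```bash
# Assumes the api_server launched above listens on port 8388 (the CI default);
# change the port if you started the server with a different --port.
curl -s http://0.0.0.0:8388/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
        "model": "default",
        "messages": [{"role": "user", "content": "The largest ocean is"}],
        "max_tokens": 64
      }'
```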
diff --git a/scripts/run_ci_hpu.sh b/scripts/run_ci_hpu.sh
new file mode 100755
index 000000000..f6fd8f4c0
--- /dev/null
+++ b/scripts/run_ci_hpu.sh
@@ -0,0 +1,154 @@
+#!/bin/bash
+DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+echo "$DIR"
+
+# install dependencies
+apt install -y lsof
+
+export FD_API_PORT=8388
+export FD_ENGINE_QUEUE_PORT=8902
+export FD_METRICS_PORT=8202
+
+# release related resources
+ps -efww | grep -E 'api_server' | grep -v grep | awk '{print $2}' | xargs kill -9 || true
+ps -efww | grep -E $FD_API_PORT | grep -v grep | awk '{print $2}' | xargs kill -9 || true
+lsof -t -i :$FD_API_PORT | xargs kill -9 || true
+
+echo "pip requirements"
+python -m pip install -r requirements.txt
+
+echo "uninstall original packages"
+# uninstall PaddleCustomDevice (paddle-intel-hpu)
+python -m pip uninstall paddle-intel-hpu -y
+# uninstall FastDeploy
+python -m pip uninstall fastdeploy_intel_hpu -y
+python -m pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple
+# install PaddlePaddle
+pip install paddlepaddle==3.1.1 -i https://www.paddlepaddle.org.cn/packages/stable/cpu/
+# install PaddleCustomDevice (paddle-intel-hpu)
+pip install https://paddle-qa.bj.bcebos.com/suijiaxin/HPU/paddle_intel_hpu-0.0.1-cp310-cp310-linux_x86_64.whl
+pip install https://paddle-qa.bj.bcebos.com/suijiaxin/HPU/paddlenlp_ops-0.0.0-cp310-cp310-linux_x86_64.whl
+
+# build and install FastDeploy
+echo "build whl"
+wget -q https://paddle-qa.bj.bcebos.com/suijiaxin/HPU/third-party/DeepGEMM.tar.gz && tar -xzf DeepGEMM.tar.gz -C custom_ops/third_party/
+wget -q https://paddle-qa.bj.bcebos.com/suijiaxin/HPU/third-party/cutlass.tar.gz && tar -xzf cutlass.tar.gz -C custom_ops/third_party/
+wget -q https://paddle-qa.bj.bcebos.com/suijiaxin/HPU/third-party/json.tar.gz && tar -xzf json.tar.gz -C custom_ops/third_party/ && mv custom_ops/third_party/json custom_ops/third_party/nlohmann_json
+chmod +x build.sh
+bash build.sh || exit 1
+pip install dist/fastdeploy_intel_hpu-2.3.0.dev0-py3-none-any.whl --force-reinstall
+
+# install test dependencies
+echo "pip others"
+pip install numpy
+pip install requests
+pip install tqdm
+pip install ddt
+pip install gradio
+pip install aistudio-sdk
+pip install pytest
+
+# start serving
+rm -rf log/*
+rm -f server.log
+# clear the message queue
+ipcrm --all=msg
+
+# start server
+export GC_KERNEL_PATH=/usr/lib/habanalabs/libtpc_kernels.so
+export GC_KERNEL_PATH=/usr/local/lib/python3.10/dist-packages/paddle_custom_device/intel_hpu/libcustom_tpc_perf_lib.so:$GC_KERNEL_PATH
+export PADDLE_DISTRI_BACKEND=xccl
+export PADDLE_XCCL_BACKEND=intel_hpu
+export FLAGS_intel_hpu_recipe_cache_num=20480
+export HABANA_PROFILE=0
+
+# do not use any proxy
+unset http_proxy
+unset https_proxy
+unset no_proxy
+
+echo "MODEL_PATH=${MODEL_PATH}"
+# FastDeploy PR testing currently shares the same Intel HPU machine with PaddleCustomDevice PR testing.
+# ERNIE-4.5-300B-A47B-Paddle would occupy all 8 HPUs and block PaddleCustomDevice PR testing,
+# so use ERNIE-4.5-21B-A3B-Paddle for now, which only needs 1 HPU.
+FD_ATTENTION_BACKEND_NAME="HPU_ATTN"
+# ERNIE-4.5-300B-A47B-Paddle (300B)
+ENABLE_TESTING_ERNIE45_300B_A47B_Paddle=0
+if [ $ENABLE_TESTING_ERNIE45_300B_A47B_Paddle -eq 1 ]; then
+    export model_path=${MODEL_PATH}/ERNIE-4.5-300B-A47B-Paddle
+    export HPU_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+    echo "CMD Line: HPU_PERF_BREAKDOWN_SYNC_MODE=1 HPU_WARMUP_BUCKET=0 HPU_WARMUP_MODEL_LEN=3072 FD_ATTENTION_BACKEND=$FD_ATTENTION_BACKEND_NAME python -m fastdeploy.entrypoints.openai.api_server --model $model_path --port $FD_API_PORT --engine-worker-queue-port $FD_ENGINE_QUEUE_PORT --metrics-port $FD_METRICS_PORT --kv-cache-ratio 0.98 --num-gpu-blocks-override 3200 --tensor-parallel-size 8 --max-model-len 32786 --max-num-seqs 128 --block-size 128 --graph-optimization-config '{"use_cudagraph":false}' > server.log 2>&1 &"
+    HPU_PERF_BREAKDOWN_SYNC_MODE=1 HPU_WARMUP_BUCKET=0 HPU_WARMUP_MODEL_LEN=3072 FD_ATTENTION_BACKEND=$FD_ATTENTION_BACKEND_NAME python -m fastdeploy.entrypoints.openai.api_server --model $model_path --port $FD_API_PORT --engine-worker-queue-port $FD_ENGINE_QUEUE_PORT --metrics-port $FD_METRICS_PORT --kv-cache-ratio 0.98 --num-gpu-blocks-override 3200 --tensor-parallel-size 8 --max-model-len 32786 --max-num-seqs 128 --block-size 128 --graph-optimization-config '{"use_cudagraph":false}' > server.log 2>&1 &
+fi
+
+# ERNIE-4.5-21B-A3B-Paddle (21B)
+ENABLE_TESTING_ERNIE45_21B_A3B_Paddle=1
+if [ $ENABLE_TESTING_ERNIE45_21B_A3B_Paddle -eq 1 ]; then
+    export model_path=${MODEL_PATH}/ERNIE-4.5-21B-A3B-Paddle/
+    export HPU_VISIBLE_DEVICES=3
+    echo "CMD Line: HPU_PERF_BREAKDOWN_SYNC_MODE=1 HPU_WARMUP_BUCKET=0 HPU_WARMUP_MODEL_LEN=4096 FD_ATTENTION_BACKEND=$FD_ATTENTION_BACKEND_NAME python -m fastdeploy.entrypoints.openai.api_server --model $model_path --port $FD_API_PORT --engine-worker-queue-port $FD_ENGINE_QUEUE_PORT --metrics-port $FD_METRICS_PORT --tensor-parallel-size 1 --max-model-len 32786 --max-num-seqs 128 --block-size 128 --graph-optimization-config '{"use_cudagraph":false}' > server.log 2>&1 &"
+    HPU_PERF_BREAKDOWN_SYNC_MODE=1 HPU_WARMUP_BUCKET=0 HPU_WARMUP_MODEL_LEN=4096 FD_ATTENTION_BACKEND=$FD_ATTENTION_BACKEND_NAME python -m fastdeploy.entrypoints.openai.api_server --model $model_path --port $FD_API_PORT --engine-worker-queue-port $FD_ENGINE_QUEUE_PORT --metrics-port $FD_METRICS_PORT --tensor-parallel-size 1 --max-model-len 32786 --max-num-seqs 128 --block-size 128 --graph-optimization-config '{"use_cudagraph":false}' > server.log 2>&1 &
+fi
+
+sleep 60
+# check that serving is up
+TIMEOUT=$((60 * 60)) #60min
+INTERVAL=10 #check every 10s
+ENDPOINT="http://0.0.0.0:$FD_API_PORT/health"
+START_TIME=$(date +%s) #start time
+echo "Start checking the serving status, waiting up to ${TIMEOUT} seconds"
+while true; do
+    # calculate elapsed time
+    CURRENT_TIME=$(date +%s)
+    ELAPSED=$((CURRENT_TIME - START_TIME))
+
+    # check for timeout
+    if [ $ELAPSED -ge $TIMEOUT ]; then
+        echo -e "\nstarting the server failed: timed out after $((TIMEOUT/60)) minutes"
+        cat server.log
+        # ERNIE-4.5-21B-A3B-Paddle only has workerlog.0
+        cat log/workerlog.0
+        # ERNIE-4.5-300B-A47B-Paddle (300B) has 8 worker logs
+        if [ $ENABLE_TESTING_ERNIE45_300B_A47B_Paddle -eq 1 ]; then
+            cat log/workerlog.1
+            cat log/workerlog.2
+            cat log/workerlog.3
+            cat log/workerlog.4
+            cat log/workerlog.5
+            cat log/workerlog.6
+            cat log/workerlog.7
+        fi
+        exit 1
+    fi
+
+    HTTP_CODE=$(curl -s -o /dev/null -w "%{http_code}" -m 2 "$ENDPOINT" || true)
+
+    if [ "$HTTP_CODE" = "200" ]; then
+        echo -e "\nserving started successfully! It took ${ELAPSED} seconds"
+        break
+    else
+        echo -e "$(date +%F_%H:%M:%S) checking serving startup status......"
+        sleep $INTERVAL
+    fi
+done
+
+cat server.log
+
+# run the serving inference test
+echo "Start inference testing..."
+python -m pytest tests/ci_use/HPU/run_ernie.py
+exit_code=$?
+echo exit_code is ${exit_code}
+
+ps -efww | grep -E 'api_server' | grep -v grep | awk '{print $2}' | xargs kill -9 || true
+ps -efww | grep -E $FD_API_PORT | grep -v grep | awk '{print $2}' | xargs kill -9 || true
+lsof -t -i :$FD_API_PORT | xargs kill -9 || true
+
+if [ ${exit_code} -ne 0 ]; then
+    echo "log/workerlog.0"
+    cat log/workerlog.0
+    echo "model testing failed, please check your PR source code"
+    exit 1
+fi
+
+sleep 5
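Outside of GitHub Actions, the script can be exercised the same way the workflow container runs it; a sketch, assuming `MODEL_PATH` points at a directory that contains `ERNIE-4.5-21B-A3B-Paddle` (the workflow mounts `/ssd1` for this):

```bash
# Run the CI script manually from a FastDeploy checkout on an HPU machine.
cd FastDeploy
MODEL_PATH=/ssd1 bash scripts/run_ci_hpu.sh
```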
diff --git a/tests/ci_use/HPU/run_ernie.py b/tests/ci_use/HPU/run_ernie.py
new file mode 100644
index 000000000..979322541
--- /dev/null
+++ b/tests/ci_use/HPU/run_ernie.py
@@ -0,0 +1,46 @@
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+
+import openai
+
+
+def test_hpu():
+    ip = "0.0.0.0"
+    service_http_port = os.getenv("FD_API_PORT", "8388")  # service port
+    client = openai.Client(base_url=f"http://{ip}:{service_http_port}/v1", api_key="EMPTY_API_KEY")
+
+    # chat
+    response = client.chat.completions.create(
+        model="default",
+        messages=[
+            {"role": "user", "content": "The largest ocean is"},
+        ],
+        temperature=1,
+        top_p=0,
+        max_tokens=64,
+        stream=False,
+    )
+    print(f"response is: {response}", flush=True)
+
+    generate_context = response.choices[0].message.content
+    print(f"\ngenerate_context is: {generate_context}", flush=True)
+
+    assert "pacific ocean" in generate_context.lower(), "The answer was incorrect!"
+    print("Test passed successfully!", flush=True)
+
+
+if __name__ == "__main__":
+    test_hpu()
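To re-run only this test against a server that is already up, point it at the same port the CI script exports; the test falls back to 8388 when `FD_API_PORT` is unset:

```bash
# -s lets the test's print() output show up in the pytest run.
FD_API_PORT=8388 python -m pytest -s tests/ci_use/HPU/run_ernie.py
```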