[INTEL_HPU] [CI] enabled fastdeploy PR testing (#4596)

* [INTEL HPU] added hpu ci work flow support Signed-off-by: Luo, Focus <focus.luo@intel.com> * [INTEL HPU] added run ci hpu test scripts Signed-off-by: Luo, Focus <focus.luo@intel.com> * [INTEL HPU] enabled HPU ernie test case Signed-off-by: Luo, Focus <focus.luo@intel.com> * [INTEL HPU] updated Intel Gaudi Readme with Warmup disable cmdline Signed-off-by: Luo, Focus <focus.luo@intel.com> * Modify paddlepaddle installation command Updated paddlepaddle installation command to use a specific index URL. * Update run_ci_hpu.sh * Rename json directory to nlohmann_json Rename extracted json directory to nlohmann_json. * Update ci_hpu.yml * Set pip global index URL to Tsinghua mirror * Update CI workflow to use self-hosted runner and paths * Update Docker image in CI workflow * Modify HPU installation URLs in run_ci_hpu.sh Updated the installation URL for paddle_intel_hpu and added paddlenlp_ops installation. * Fix paddle_intel_hpu installation URL Corrected the URL for paddle_intel_hpu wheel installation. --------- Signed-off-by: Luo, Focus <focus.luo@intel.com> Co-authored-by: plusNew001 <95567040+plusNew001@users.noreply.github.com>
2025-12-24 13:28:13 +08:00 · 2025-11-17 19:24:41 +08:00
parent b23e684b67
commit c2c1942db9
5 changed files with 293 additions and 0 deletions
--- a/.github/workflows/ci_hpu.yml
+++ b/.github/workflows/ci_hpu.yml
@@ -0,0 +1,85 @@
+# Pull-request CI for Intel Gaudi (HPU): checks out the PR merged onto its
+# base branch and runs scripts/run_ci_hpu.sh inside the Habana container.
+name: CI_HPU
+
+on:
+  pull_request:
+    branches:
+      - develop
+      - 'release/*'
+  workflow_dispatch:
+
+# Only one run per pull request stays active. Manual runs have no PR number,
+# so they are grouped by ref instead of all sharing the group "-hpu-ci".
+concurrency:
+  group: ${{ github.event.pull_request.number || github.ref }}-hpu-ci
+  cancel-in-progress: true
+
+jobs:
+  CI_HPU:
+    runs-on: [self-hosted, HPU-8Card]
+    steps:
+      - name: Print current runner name
+        run: |
+          echo "Current runner name: ${{ runner.name }}"
+
+      - name: Code Checkout
+        env:
+          docker_image: vault.habana.ai/gaudi-docker/1.22.0/ubuntu22.04/habanalabs/pytorch-installer-2.7.1:latest
+        run: |
+          REPO="https://github.com/${{ github.repository }}.git"
+          FULL_REPO="${{ github.repository }}"
+          REPO_NAME="${FULL_REPO##*/}"
+          # base_ref is empty for workflow_dispatch; fall back to the dispatched
+          # ref so that "git clone -b" always receives a value.
+          BASE_BRANCH="${{ github.base_ref || github.ref_name }}"
+          # Clean the repository directory before starting
+          docker run --rm --net=host -v "$(pwd)":/workspace -w /workspace \
+          -e "REPO_NAME=${REPO_NAME}" \
+          -e "BASE_BRANCH=${BASE_BRANCH}" \
+          ${docker_image} /bin/bash -c '
+            if [ -d "${REPO_NAME}" ]; then
+              echo "Directory ${REPO_NAME} exists, removing it..."
+              rm -rf "${REPO_NAME}"
+            fi
+          '
+          git config --global user.name "FastDeployCI"
+          git config --global user.email "fastdeploy_ci@example.com"
+          git clone "${REPO}" "${REPO_NAME}" -b "${BASE_BRANCH}"
+          # Enter the directory that was just cloned instead of a fixed name.
+          cd "${REPO_NAME}"
+          if [ "${{ github.event_name }}" = "pull_request" ]; then
+            git fetch origin pull/${{ github.event.pull_request.number }}/head:pr/${{ github.event.pull_request.number }}
+            git merge pr/${{ github.event.pull_request.number }}
+            git log -n 3 --oneline
+          else
+            git checkout ${{ github.sha }}
+            git log -n 3 --oneline
+          fi
+
+      - name: Run CI unittest
+        env:
+          docker_image: vault.habana.ai/gaudi-docker/1.22.0/ubuntu22.04/habanalabs/pytorch-installer-2.7.1:latest
+        run: |
+          FULL_REPO="${{ github.repository }}"
+          REPO_NAME="${FULL_REPO##*/}"
+          FD_API_PORT=8388
+          FD_ENGINE_QUEUE_PORT=8902
+          FD_METRICS_PORT=8202
+
+          # REPO_NAME below is expanded by this shell, before the container starts.
+          docker run --rm --net=host --cap-add=SYS_PTRACE --privileged --shm-size=64G  \
+          -v "$(pwd)":/workspace -w /workspace \
+          -v "/ssd1:/ssd1" \
+          -e "MODEL_PATH=/ssd1" \
+          -e "http_proxy=$(git config --global --get http.proxy)" \
+          -e "https_proxy=$(git config --global --get https.proxy)" \
+          -e "no_proxy=bcebos.com,mirrors.tuna.tsinghua.edu.cn,localhost,127.0.0.1,0.0.0.0,10.0.0.0/8,192.168.1.0/24" \
+          -e "FD_API_PORT=${FD_API_PORT}" \
+          -e "FD_ENGINE_QUEUE_PORT=${FD_ENGINE_QUEUE_PORT}" \
+          -e "FD_METRICS_PORT=${FD_METRICS_PORT}" \
+           ${docker_image} /bin/bash -c "
+          git config --global --add safe.directory /workspace/${REPO_NAME}
+          cd ${REPO_NAME}
+          bash scripts/run_ci_hpu.sh
+          "
--- a/docs/get_started/installation/intel_gaudi.md
+++ b/docs/get_started/installation/intel_gaudi.md
@@ -57,7 +57,11 @@ export PADDLE_XCCL_BACKEND=intel_hpu
 export HABANA_PROFILE=0
 export HPU_VISIBLE_DEVICES=0

+# Warmup enabled (HPU_WARMUP_BUCKET=1)
 HPU_WARMUP_BUCKET=1 HPU_WARMUP_MODEL_LEN=4096 FD_ATTENTION_BACKEND=HPU_ATTN python -m fastdeploy.entrypoints.openai.api_server --model ERNIE-4.5-21B-A3B-Paddle --tensor-parallel-size 1 --max-model-len 32768 --max-num-seqs 128
+
+# Warmup disabled (HPU_WARMUP_BUCKET=0), with use_cudagraph turned off
+HPU_WARMUP_BUCKET=0 HPU_WARMUP_MODEL_LEN=4096 FD_ATTENTION_BACKEND=HPU_ATTN python -m fastdeploy.entrypoints.openai.api_server --model ERNIE-4.5-21B-A3B-Paddle --tensor-parallel-size 1 --max-model-len 32768 --max-num-seqs 128 --graph-optimization-config '{"use_cudagraph":false}'
 ```

 ### 2. Launch the request
--- a/docs/zh/get_started/installation/intel_gaudi.md
+++ b/docs/zh/get_started/installation/intel_gaudi.md
@@ -57,7 +57,11 @@ export PADDLE_XCCL_BACKEND=intel_hpu
 export HABANA_PROFILE=0
 export HPU_VISIBLE_DEVICES=0

+# Warmup enabled (HPU_WARMUP_BUCKET=1)
 HPU_WARMUP_BUCKET=1 HPU_WARMUP_MODEL_LEN=4096 FD_ATTENTION_BACKEND=HPU_ATTN python -m fastdeploy.entrypoints.openai.api_server --model ERNIE-4.5-21B-A3B-Paddle --tensor-parallel-size 1 --max-model-len 32768 --max-num-seqs 128
+
+# Warmup disabled (HPU_WARMUP_BUCKET=0), with use_cudagraph turned off
+HPU_WARMUP_BUCKET=0 HPU_WARMUP_MODEL_LEN=4096 FD_ATTENTION_BACKEND=HPU_ATTN python -m fastdeploy.entrypoints.openai.api_server --model ERNIE-4.5-21B-A3B-Paddle --tensor-parallel-size 1 --max-model-len 32768 --max-num-seqs 128 --graph-optimization-config '{"use_cudagraph":false}'
 ```

 ### 2. 发送请求
--- a/scripts/run_ci_hpu.sh
+++ b/scripts/run_ci_hpu.sh
@@ -0,0 +1,226 @@
+#!/bin/bash
+# CI entry point for FastDeploy pull-request testing on Intel HPU (Gaudi).
+#
+# Flow: free the serving port, install the Paddle/HPU wheels, build and install
+# the FastDeploy wheel, launch the api_server in the background, poll /health
+# until it answers 200, then run the pytest smoke test against it.
+#
+# Environment: MODEL_PATH - directory that holds the ERNIE model folders.
+# Exit status: 1 when the build, the server start-up or the smoke test fails.
+DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+echo "$DIR"
+# Every relative path below (requirements.txt, build.sh, dist/, tests/) is
+# rooted at the repository, i.e. the parent of this scripts/ directory.
+cd "${DIR}/.." || exit 1
+
+# Fail before the long install/build phase when the model root is unknown.
+if [ -z "${MODEL_PATH}" ]; then
+    echo "MODEL_PATH is not set"
+    exit 1
+fi
+
+#install dependencies
+if ! command -v lsof >/dev/null 2>&1; then
+    # Retry once after refreshing the package lists in case they are missing.
+    apt-get install -y lsof || { apt-get update && apt-get install -y lsof; }
+fi
+
+export FD_API_PORT=8388
+export FD_ENGINE_QUEUE_PORT=8902
+export FD_METRICS_PORT=8202
+
+# Force-kill leftover serving processes: anything whose command line mentions
+# api_server or the API port, plus whatever still has the API port open.
+# pkill -f looks at the command line only, so a process whose PID merely
+# contains the port digits is not hit (grepping "ps -ef" output matched those).
+release_serving_resources() {
+    pkill -9 -f 'api_server' || true
+    pkill -9 -f -- "${FD_API_PORT}" || true
+    # -r: skip kill entirely when lsof reports no listener.
+    lsof -t -i ":${FD_API_PORT}" | xargs -r kill -9 || true
+}
+
+# Print server.log followed by the worker logs of the model under test.
+dump_serving_logs() {
+    local last_worker=0 idx
+    cat server.log
+    #ERNIE-4.5-21B-A3B-Paddle only has workerlog.0
+    #ERNIE-4.5-300B-A47B-Paddle (300B) will have 8 workerlog
+    if [ "${ENABLE_TESTING_ERNIE45_300B_A47B_Paddle}" -eq 1 ]; then
+        last_worker=7
+    fi
+    for ((idx = 0; idx <= last_worker; idx++)); do
+        cat "log/workerlog.${idx}"
+    done
+}
+
+# Launch the OpenAI-compatible api_server in the background.
+#   $1 - model directory   $2 - HPU_WARMUP_MODEL_LEN   $3.. - extra server flags
+# Output goes to server.log; the PID of the server is left in SERVER_PID.
+# NOTE(review): --max-model-len 32786 is kept as committed; it looks like a
+# transposition of 32768 (the value the Gaudi docs use) - please confirm.
+start_api_server() {
+    local model_dir=$1 warmup_len=$2
+    shift 2
+    local -a cmd=(
+        env HPU_PERF_BREAKDOWN_SYNC_MODE=1 HPU_WARMUP_BUCKET=0
+        "HPU_WARMUP_MODEL_LEN=${warmup_len}"
+        "FD_ATTENTION_BACKEND=${FD_ATTENTION_BACKEND_NAME}"
+        python -m fastdeploy.entrypoints.openai.api_server
+        --model "${model_dir}" --port "${FD_API_PORT}"
+        --engine-worker-queue-port "${FD_ENGINE_QUEUE_PORT}"
+        --metrics-port "${FD_METRICS_PORT}"
+        "$@"
+        --max-model-len 32786 --max-num-seqs 128 --block-size 128
+        --graph-optimization-config '{"use_cudagraph":false}'
+    )
+    # %q prints each word shell-quoted, so the JSON argument keeps its quotes.
+    printf 'CMD Line:'
+    printf ' %q' "${cmd[@]}"
+    printf ' > server.log 2>&1 &\n'
+    "${cmd[@]}" > server.log 2>&1 &
+    SERVER_PID=$!
+}
+
+#release relative resource
+release_serving_resources
+
+echo "pip requirements"
+python -m pip install -r requirements.txt
+
+echo "uninstall org"
+#to uninstall PaddleCustomDevice (paddle-intel-hpu)
+python -m pip uninstall paddle-intel-hpu -y
+#to uninstall fastdeploy
+python -m pip uninstall fastdeploy_intel_hpu -y
+python -m pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple
+#to install paddlepaddle
+pip install paddlepaddle==3.1.1 -i https://www.paddlepaddle.org.cn/packages/stable/cpu/
+#to install PaddleCustomDevice (paddle-intel-hpu)
+pip install https://paddle-qa.bj.bcebos.com/suijiaxin/HPU/paddle_intel_hpu-0.0.1-cp310-cp310-linux_x86_64.whl
+pip install https://paddle-qa.bj.bcebos.com/suijiaxin/HPU/paddlenlp_ops-0.0.0-cp310-cp310-linux_x86_64.whl
+
+#to build and install fastdeploy
+echo "build whl"
+wget -q https://paddle-qa.bj.bcebos.com/suijiaxin/HPU/third-party/DeepGEMM.tar.gz && tar -xzf DeepGEMM.tar.gz -C custom_ops/third_party/
+wget -q https://paddle-qa.bj.bcebos.com/suijiaxin/HPU/third-party/cutlass.tar.gz && tar -xzf cutlass.tar.gz -C custom_ops/third_party/
+wget -q https://paddle-qa.bj.bcebos.com/suijiaxin/HPU/third-party/json.tar.gz && tar -xzf json.tar.gz -C custom_ops/third_party/ && mv custom_ops/third_party/json custom_ops/third_party/nlohmann_json
+chmod +x build.sh
+bash build.sh || exit 1
+# Install whichever version build.sh produced rather than a pinned file name.
+wheels=(dist/fastdeploy_intel_hpu-*.whl)
+if [ ! -f "${wheels[0]}" ]; then
+    echo "no fastdeploy_intel_hpu wheel found under dist/"
+    exit 1
+fi
+pip install "${wheels[0]}" --force-reinstall || exit 1
+
+#to install dependencies
+echo "pip others"
+pip install numpy
+pip install requests
+pip install tqdm
+pip install ddt
+pip install gradio
+pip install aistudio-sdk
+pip install pytest
+
+#start serving
+rm -rf log/*
+rm -f server.log
+#clear the message queue
+ipcrm --all=msg
+
+#start server
+export GC_KERNEL_PATH=/usr/lib/habanalabs/libtpc_kernels.so
+export GC_KERNEL_PATH=/usr/local/lib/python3.10/dist-packages/paddle_custom_device/intel_hpu/libcustom_tpc_perf_lib.so:$GC_KERNEL_PATH
+export PADDLE_DISTRI_BACKEND=xccl
+export PADDLE_XCCL_BACKEND=intel_hpu
+export FLAGS_intel_hpu_recipe_cache_num=20480
+export HABANA_PROFILE=0
+
+#no proxy using
+unset http_proxy
+unset https_proxy
+unset no_proxy
+
+echo "MODEL_PATH=${MODEL_PATH}"
+#currently FastDeploy PR testing shares one Intel HPU machine with PaddleCustomDevice PR testing
+#ERNIE-4.5-300B-A47B-Paddle would use all 8 HPUs and block PaddleCustomDevice PR testing
+#so use ERNIE-4.5-21B-A3B-Paddle first, which only needs 1 HPU
+FD_ATTENTION_BACKEND_NAME="HPU_ATTN"
+SERVER_PID=""
+#ERNIE-4.5-300B-A47B-Paddle (300B)
+ENABLE_TESTING_ERNIE45_300B_A47B_Paddle=0
+if [ "${ENABLE_TESTING_ERNIE45_300B_A47B_Paddle}" -eq 1 ]; then
+    export model_path=${MODEL_PATH}/ERNIE-4.5-300B-A47B-Paddle
+    export HPU_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+    start_api_server "${model_path}" 3072 \
+        --kv-cache-ratio 0.98 --num-gpu-blocks-override 3200 --tensor-parallel-size 8
+fi
+
+#ERNIE-4.5-21B-A3B-Paddle (21B)
+ENABLE_TESTING_ERNIE45_21B_A3B_Paddle=1
+if [ "${ENABLE_TESTING_ERNIE45_21B_A3B_Paddle}" -eq 1 ]; then
+    export model_path=${MODEL_PATH}/ERNIE-4.5-21B-A3B-Paddle/
+    export HPU_VISIBLE_DEVICES=3
+    start_api_server "${model_path}" 4096 --tensor-parallel-size 1
+fi
+
+sleep 60
+#checking serving active status
+TIMEOUT=$((60 * 60)) #60min
+INTERVAL=10 #check each 10s
+ENDPOINT="http://0.0.0.0:$FD_API_PORT/health"
+START_TIME=$(date +%s) #start time
+echo "Start to check the serving active status, waiting total ${TIMEOUT} seconds"
+while true; do
+    #calculate time
+    CURRENT_TIME=$(date +%s)
+    ELAPSED=$((CURRENT_TIME - START_TIME))
+
+    #to check timeout
+    if [ "$ELAPSED" -ge "$TIMEOUT" ]; then
+        echo -e "\nstart serving failed with timeout: $((TIMEOUT/60)) minutes"
+        dump_serving_logs
+        release_serving_resources
+        exit 1
+    fi
+
+    HTTP_CODE=$(curl -s -o /dev/null -w "%{http_code}" -m 2 "$ENDPOINT" || true)
+
+    if [ "$HTTP_CODE" = "200" ]; then
+        echo -e "\nserving start successfully! it costs total ${ELAPSED} seconds"
+        break
+    fi
+
+    # Stop waiting as soon as the launched server is gone instead of polling until the timeout.
+    if [ -n "${SERVER_PID}" ] && ! kill -0 "${SERVER_PID}" 2>/dev/null; then
+        echo -e "\nstart serving failed: api_server (pid ${SERVER_PID}) exited before becoming healthy"
+        dump_serving_logs
+        release_serving_resources
+        exit 1
+    fi
+
+    echo -e "$(date +%F_%H:%M:%S) checking serving start status......"
+    sleep $INTERVAL
+done
+
+cat server.log
+
+#to do serving inference
+echo "Start inference testing..."
+python -m pytest tests/ci_use/HPU/run_ernie.py
+exit_code=$?
+echo exit_code is ${exit_code}
+
+release_serving_resources
+
+if [ "${exit_code}" -ne 0 ]; then
+    echo "log/workerlog.0"
+    cat log/workerlog.0
+    echo "model testing failed, please check the source code of your PR"
+    exit 1
+fi
+
+sleep 5
--- a/tests/ci_use/HPU/run_ernie.py
+++ b/tests/ci_use/HPU/run_ernie.py
@@ -0,0 +1,46 @@
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+
+import openai
+
+
+def test_hpu():
+    ip = "0.0.0.0"
+    service_http_port = os.getenv("FD_API_PORT", "8388")  # service port
+    client = openai.Client(base_url=f"http://{ip}:{service_http_port}/v1", api_key="EMPTY_API_KEY")
+
+    # chat
+    response = client.chat.completions.create(
+        model="default",
+        messages=[
+            {"role": "user", "content": "The largest ocean is"},
+        ],
+        temperature=1,
+        top_p=0,
+        max_tokens=64,
+        stream=False,
+    )
+    print(f"response is: {response}", flush=True)
+
+    generate_context = response.choices[0].message.content
+    print(f"\ngenerate_context is: {generate_context}", flush=True)
+
+    assert "pacific ocean" in generate_context.lower(), "The answer was incorrect!"
+    print("Test successfully!", flush=True)
+
+
+if __name__ == "__main__":
+    test_hpu()