Files
FastDeploy/scripts/run_ci_xpu.sh
plusNew001 918e4e9850
Some checks failed
CE Compile Job / ce_job_pre_check (push) Has been cancelled
CE Compile Job / print_ce_job_pre_check_outputs (push) Has been cancelled
CE Compile Job / FD-Clone-Linux (push) Has been cancelled
CE Compile Job / Show Code Archive Output (push) Has been cancelled
CE Compile Job / BUILD_SM8090 (push) Has been cancelled
CE Compile Job / BUILD_SM8689 (push) Has been cancelled
CE Compile Job / CE_UPLOAD (push) Has been cancelled
Deploy GitHub Pages / deploy (push) Has been cancelled
Publish Job / publish_pre_check (push) Has been cancelled
Publish Job / print_publish_pre_check_outputs (push) Has been cancelled
Publish Job / FD-Clone-Linux (push) Has been cancelled
Publish Job / Show Code Archive Output (push) Has been cancelled
Publish Job / BUILD_SM8090 (push) Has been cancelled
Publish Job / BUILD_SM8689 (push) Has been cancelled
Publish Job / PADDLE_PYPI_UPLOAD_8090 (push) Has been cancelled
Publish Job / PADDLE_PYPI_UPLOAD_8689 (push) Has been cancelled
Publish Job / Run FD Image Build (push) Has been cancelled
Publish Job / Run FastDeploy Unit Tests and Coverage (push) Has been cancelled
Publish Job / Run FastDeploy LogProb Tests (push) Has been cancelled
Publish Job / Extracted partial CE model tasks to run in CI. (push) Has been cancelled
Publish Job / Run Base Tests (push) Has been cancelled
Publish Job / Run Accuracy Tests (push) Has been cancelled
Publish Job / Run Stable Tests (push) Has been cancelled
CI Images Build / FD-Clone-Linux (push) Has been cancelled
CI Images Build / Show Code Archive Output (push) Has been cancelled
CI Images Build / CI Images Build (push) Has been cancelled
CI Images Build / BUILD_SM8090 (push) Has been cancelled
CI Images Build / Run FastDeploy Unit Tests and Coverage (push) Has been cancelled
CI Images Build / Run FastDeploy LogProb Tests (push) Has been cancelled
CI Images Build / Extracted partial CE model tasks to run in CI. (push) Has been cancelled
CI Images Build / Run Base Tests (push) Has been cancelled
CI Images Build / Run Accuracy Tests (push) Has been cancelled
CI Images Build / Run Stable Tests (push) Has been cancelled
CI Images Build / Publish Docker Images Pre Check (push) Has been cancelled
[XPU] Change XPU stable third-party version (#4524)
* add xpu ci case

* Add xDeepEP download and build steps

Download and build xDeepEP before running tests.

* Fix formatting and add missing sleep command

* Update Docker image version in CI workflow

* Modify run_ci_xpu.sh for log cleanup and error handling

Clean up log files before running tests and output worker log on failure.

* Enhance test_ep.py with process management and assertions

Refactor test function to include process cleanup and assertions.

* Replace test_fastdeploy_llm with test_fd_ep

* Fix conditional statement in run_ci_xpu.sh

* Update test_ep.py for string handling and formatting

Fix string encoding issues and improve readability.

* Rename test_ep.py to run_ep.py

* Change test script from test_ep.py to run_ep.py

* Update dependency versions for stable release

* Install pytest-timeout and modify test execution

Added pytest-timeout installation and updated test command.
2025-10-22 19:43:03 +08:00

210 lines
5.8 KiB
Bash
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/bin/bash
DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
echo "$DIR"
#安装lsof工具
apt install -y lsof
#先kill一遍
ps -efww | grep -E 'api_server' | grep -v grep | awk '{print $2}' | xargs kill -9 || true
ps -efww | grep -E '8188' | grep -v grep | awk '{print $2}' | xargs kill -9 || true
lsof -t -i :8188 | xargs kill -9 || true
#设置模型路径
export model_path=${MODEL_PATH}/ERNIE-4.5-300B-A47B-Paddle
echo "pip requirements"
python -m pip install -r requirements.txt
echo "uninstall org"
python -m pip uninstall paddlepaddle-xpu -y
python -m pip uninstall fastdeploy-xpu -y
python -m pip install paddlepaddle-xpu -i https://www.paddlepaddle.org.cn/packages/nightly/xpu-p800/
echo "build whl"
bash custom_ops/xpu_ops/download_dependencies.sh develop
export CLANG_PATH=$(pwd)/custom_ops/xpu_ops/third_party/xtdk
export XVLLM_PATH=$(pwd)/custom_ops/xpu_ops/third_party/xvllm
bash build.sh || exit 1
echo "pip others"
python -m pip install openai -U
python -m pip uninstall -y triton
python -m pip install triton==3.3.0
python -m pip install pytest
python -m pip install pytest-timeout
unset http_proxy
unset https_proxy
unset no_proxy
# 起服务
rm -rf log/*
rm -f core*
# pkill -9 python #流水线不执行这个
#清空消息队列
ipcrm --all=msg
echo "============================开始V0模式测试!============================"
export ENABLE_V1_KVCACHE_SCHEDULER=0
export XPU_VISIBLE_DEVICES="0,1,2,3,4,5,6,7"
python -m fastdeploy.entrypoints.openai.api_server \
--model ${model_path} \
--port 8188 \
--tensor-parallel-size 8 \
--num-gpu-blocks-override 16384 \
--max-model-len 32768 \
--max-num-seqs 128 \
--quantization wint4 > server.log 2>&1 &
sleep 60
# 探活
TIMEOUT=$((15 * 60))
INTERVAL=10 # 检查间隔(秒)
ENDPOINT="http://0.0.0.0:8188/health"
START_TIME=$(date +%s) # 记录开始时间戳
echo "开始服务健康检查,最长等待时间:${TIMEOUT}"
while true; do
# 计算已耗时
CURRENT_TIME=$(date +%s)
ELAPSED=$((CURRENT_TIME - START_TIME))
# 超时判断
if [ $ELAPSED -ge $TIMEOUT ]; then
echo -e "\n服务启动超时经过 $((TIMEOUT/60)) 分钟服务仍未启动!"
cat server.log
cat log/workerlog.0
cat log/workerlog.1
cat log/workerlog.2
cat log/workerlog.3
exit 1
fi
HTTP_CODE=$(curl -s -o /dev/null -w "%{http_code}" -m 2 "$ENDPOINT" || true)
if [ "$HTTP_CODE" = "200" ]; then
echo -e "\n服务启动成功耗时 ${ELAPSED}"
break
else
sleep $INTERVAL
fi
done
cat server.log
# 执行服务化推理
python -m pytest tests/ci_use/XPU_45T/run_45T.py
exit_code=$?
echo exit_code is ${exit_code}
ps -efww | grep -E 'api_server' | grep -v grep | awk '{print $2}' | xargs kill -9 || true
ps -efww | grep -E '8188' | grep -v grep | awk '{print $2}' | xargs kill -9 || true
lsof -t -i :8188 | xargs kill -9 || true
if [ ${exit_code} -ne 0 ]; then
echo "log/workerlog.0"
cat log/workerlog.0
echo "模型起服务失败请检查pr代码"
exit 1
fi
sleep 5
#0731新增kv block集中式管理相关测试在起服务时启用对应环境变量 export ENABLE_V1_KVCACHE_SCHEDULER=True
# 起服务
rm -rf log/*
rm -f core*
# pkill -9 python #流水线不执行这个
#清空消息队列
ipcrm --all=msg
echo "============================开始V1模式测试!============================"
export ENABLE_V1_KVCACHE_SCHEDULER=1
export XPU_VISIBLE_DEVICES="0,1,2,3,4,5,6,7"
python -m fastdeploy.entrypoints.openai.api_server \
--model ${model_path} \
--port 8188 \
--tensor-parallel-size 8 \
--num-gpu-blocks-override 16384 \
--max-model-len 32768 \
--max-num-seqs 128 \
--quantization wint4 > server.log 2>&1 &
sleep 60
# 探活
TIMEOUT=$((15 * 60))
INTERVAL=10 # 检查间隔(秒)
ENDPOINT="http://0.0.0.0:8188/health"
START_TIME=$(date +%s) # 记录开始时间戳
echo "开始服务健康检查,最长等待时间:${TIMEOUT}"
while true; do
# 计算已耗时
CURRENT_TIME=$(date +%s)
ELAPSED=$((CURRENT_TIME - START_TIME))
# 超时判断
if [ $ELAPSED -ge $TIMEOUT ]; then
echo -e "\n服务启动超时经过 $((TIMEOUT/60)) 分钟服务仍未启动!"
cat server.log
cat log/workerlog.0
exit 1
fi
HTTP_CODE=$(curl -s -o /dev/null -w "%{http_code}" -m 2 "$ENDPOINT" || true)
if [ "$HTTP_CODE" = "200" ]; then
echo -e "\n服务启动成功耗时 ${ELAPSED}"
break
else
sleep $INTERVAL
fi
done
cat server.log
# 执行服务化推理
python -m pytest tests/ci_use/XPU_45T/run_45T.py
kv_block_test_exit_code=$?
echo kv_block_test_exit_code is ${kv_block_test_exit_code}
unset ENABLE_V1_KVCACHE_SCHEDULER
ps -efww | grep -E 'api_server' | grep -v grep | awk '{print $2}' | xargs kill -9 || true
ps -efww | grep -E '8188' | grep -v grep | awk '{print $2}' | xargs kill -9 || true
lsof -t -i :8188 | xargs kill -9 || true
if [ ${kv_block_test_exit_code} -ne 0 ]; then
echo "log/workerlog.0"
cat log/workerlog.0
echo "kv block相关测试失败请检查pr代码"
exit 1
fi
echo "============================开始EP并行测试!============================"
sleep 5
rm -rf log/*
rm -f core*
xpu-smi
export XPU_VISIBLE_DEVICES="0,1,2,3"
export BKCL_ENABLE_XDR=1
export BKCL_RDMA_NICS=xgbe1,xgbe2,xgbe3,xgbe4
export BKCL_TRACE_TOPO=1
export BKCL_PCIE_RING=1
export XSHMEM_MODE=1
export XSHMEM_QP_NUM_PER_RANK=32
export BKCL_RDMA_VERBS=1
wget -q https://paddle-qa.bj.bcebos.com/xpu_third_party/xDeepEP.tar.gz
tar -xzf xDeepEP.tar.gz
cd xDeepEP
bash build.sh
cd -
python -m pytest -s --timeout=300 tests/ci_use/XPU_45T/run_ep.py
ep_exit_code=$?
if [ ${ep_exit_code} -ne 0 ]; then
echo "log/workerlog.0"
cat log/workerlog.0
echo "EP并行 相关测试失败请检查pr代码"
exit 1
fi