Files
FastDeploy/scripts/run_ci_xpu.sh
zccjjj e927c65742
Some checks failed
CE Compile Job / ce_job_pre_check (push) Has been cancelled
CE Compile Job / print_ce_job_pre_check_outputs (push) Has been cancelled
CE Compile Job / FD-Clone-Linux (push) Has been cancelled
CE Compile Job / Show Code Archive Output (push) Has been cancelled
CE Compile Job / BUILD_SM8090 (push) Has been cancelled
CE Compile Job / BUILD_SM8689 (push) Has been cancelled
CE Compile Job / CE_UPLOAD (push) Has been cancelled
Deploy GitHub Pages / deploy (push) Has been cancelled
[XPU] [Optimization] [EP] EP communication optimization. (#5145)
2025-12-05 10:03:45 +08:00

704 lines
21 KiB
Bash
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/bin/bash
DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
echo "$DIR"
#安装ci必要工具
apt install -y lsof
apt-get install -y iproute2
#先kill一遍
function stop_processes() {
ps -efww | grep -E 'cache_transfer_manager.py' | grep -v grep | awk '{print $2}' | xargs kill -9 || true
ps -efww | grep -E 'api_server' | grep -v grep | awk '{print $2}' | xargs kill -9 || true
ps -efww | grep -E "$((8188 + XPU_ID * 100))" | grep -v grep | awk '{print $2}' | xargs kill -9 || true
lsof -t -i :$((8188 + XPU_ID * 100)) | xargs kill -9 || true
for port in $(seq $((8188 + XPU_ID * 100 + 10)) $((8188 + XPU_ID * 100 + 40))); do
lsof -t -i :${port} | xargs kill -9 || true
done
netstat -tunlp 2>/dev/null | grep $((8190 + XPU_ID * 100)) | awk '{print $NF}' | awk -F'/' '{print $1}' | xargs -r kill -9
netstat -tunlp 2>/dev/null | grep $((8190 + XPU_ID * 100)) | awk '{print $(NF-1)}' | cut -d/ -f1 | grep -E '^[0-9]+$' | xargs -r kill -9
}
stop_processes >kill.log 2>&1
# 由于机器原因,需重启使用的卡,以保障没有问题
if [[ "$XPU_ID" == "0" ]]; then
export XPU_VISIBLE_DEVICES="0,1,2,3"
else
export XPU_VISIBLE_DEVICES="4,5,6,7"
fi
mkdir -p /workspace/deps
cd /workspace/deps
wget -q https://klx-sdk-release-public.su.bcebos.com/xre/kl3-release/5.0.21.21/xre-Linux-x86_64-5.0.21.21.tar.gz
tar -zxf xre-Linux-x86_64-5.0.21.21.tar.gz && mv xre-Linux-x86_64-5.0.21.21 xre
cd -
export PATH=/workspace/deps/xre/bin:$PATH
xpu-smi -r -i $XPU_VISIBLE_DEVICES
xpu-smi
python -m pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple
echo "pip requirements"
python -m pip install -r requirements.txt
echo "uninstall org"
python -m pip uninstall paddlepaddle-xpu -y
python -m pip uninstall fastdeploy-xpu -y
# python -m pip install paddlepaddle-xpu -i https://www.paddlepaddle.org.cn/packages/nightly/xpu-p800/
# 由于ep并行报错暂时锁死paddle版本
python -m pip install https://paddle-whl.bj.bcebos.com/nightly/xpu-p800/paddlepaddle-xpu/paddlepaddle_xpu-3.3.0.dev20251123-cp310-cp310-linux_x86_64.whl
echo "build whl"
bash custom_ops/xpu_ops/download_dependencies.sh develop
export CLANG_PATH=$(pwd)/custom_ops/xpu_ops/third_party/xtdk
export XVLLM_PATH=$(pwd)/custom_ops/xpu_ops/third_party/xvllm
bash build.sh || exit 1
echo "pip others"
python -m pip install openai -U
python -m pip uninstall -y triton
python -m pip install triton==3.3.0
python -m pip install pytest
python -m pip install pytest-timeout
unset http_proxy
unset https_proxy
unset no_proxy
stop_processes >kill.log 2>&1
# 起服务
rm -rf log/*
rm -f core*
# pkill -9 python #流水线不执行这个
#清空消息队列
ipcrm --all=msg
echo "============================开始V1模式测试!============================"
if [[ "$XPU_ID" == "0" ]]; then
export XPU_VISIBLE_DEVICES="0,1,2,3"
else
export XPU_VISIBLE_DEVICES="4,5,6,7"
fi
export port_num=$((8188 + XPU_ID * 100))
python -m fastdeploy.entrypoints.openai.api_server \
--model ${MODEL_PATH}/ERNIE-4.5-300B-A47B-Paddle \
--port $port_num \
--engine-worker-queue-port $((port_num + 1)) \
--metrics-port $((port_num + 2)) \
--cache-queue-port $((port_num + 47873)) \
--tensor-parallel-size 4 \
--num-gpu-blocks-override 16384 \
--max-model-len 32768 \
--max-num-seqs 128 \
--quantization wint4 \
--enable-prefix-caching \
--enable-chunked-prefill > server.log 2>&1 &
sleep 60
# 探活
TIMEOUT=$((15 * 60))
INTERVAL=10 # 检查间隔(秒)
ENDPOINT="http://0.0.0.0:${port_num}/health"
START_TIME=$(date +%s) # 记录开始时间戳
echo "开始服务健康检查,最长等待时间:${TIMEOUT}"
while true; do
# 计算已耗时
CURRENT_TIME=$(date +%s)
ELAPSED=$((CURRENT_TIME - START_TIME))
# 超时判断
if [ $ELAPSED -ge $TIMEOUT ]; then
echo -e "\n服务启动超时经过 $((TIMEOUT/60)) 分钟服务仍未启动!"
stop_processes
echo "server.log"
cat server.log
echo "log/workerlog.0"
cat log/workerlog.0
exit 1
fi
HTTP_CODE=$(curl -s -o /dev/null -w "%{http_code}" -m 2 "$ENDPOINT" || true)
echo -e "\r服务健康检查中... 已等待 ${ELAPSED} 秒,当前状态码:${HTTP_CODE}"
if [ "$HTTP_CODE" = "200" ]; then
echo -e "\n服务启动成功耗时 ${ELAPSED}"
break
else
sleep $INTERVAL
fi
done
# 执行服务化推理
python -m pytest -s tests/ci_use/XPU_45T/run_45T.py
kv_block_test_exit_code=$?
echo kv_block_test_exit_code is ${kv_block_test_exit_code}
stop_processes >kill.log 2>&1
if [ ${kv_block_test_exit_code} -ne 0 ]; then
echo "server.log"
cat server.log
echo "log/workerlog.0"
cat log/workerlog.0
echo "kv block相关测试失败请检查pr代码"
exit 1
fi
sleep 5
# 起服务
rm -rf log/*
rm -f core*
# pkill -9 python #流水线不执行这个
#清空消息队列
ipcrm --all=msg
echo "============================开始W4A8测试!============================"
if [[ "$XPU_ID" == "0" ]]; then
export XPU_VISIBLE_DEVICES="0,1,2,3"
else
export XPU_VISIBLE_DEVICES="4,5,6,7"
fi
export port_num=$((8188 + XPU_ID * 100))
python -m fastdeploy.entrypoints.openai.api_server \
--model ${MODEL_PATH}/ERNIE-4.5-300B-A47B-W4A8C8-TP4-Paddle \
--port $port_num \
--engine-worker-queue-port $((port_num + 1)) \
--metrics-port $((port_num + 2)) \
--cache-queue-port $((port_num + 47873)) \
--tensor-parallel-size 4 \
--num-gpu-blocks-override 16384 \
--max-model-len 32768 \
--max-num-seqs 64 \
--quantization "W4A8" > server.log 2>&1 &
sleep 60
# 探活
TIMEOUT=$((15 * 60))
INTERVAL=10 # 检查间隔(秒)
ENDPOINT="http://0.0.0.0:${port_num}/health"
START_TIME=$(date +%s) # 记录开始时间戳
echo "开始服务健康检查,最长等待时间:${TIMEOUT}"
while true; do
# 计算已耗时
CURRENT_TIME=$(date +%s)
ELAPSED=$((CURRENT_TIME - START_TIME))
# 超时判断
if [ $ELAPSED -ge $TIMEOUT ]; then
echo -e "\n服务启动超时经过 $((TIMEOUT/60)) 分钟服务仍未启动!"
stop_processes
echo "server.log"
cat server.log
echo "log/workerlog.0"
cat log/workerlog.0
exit 1
fi
HTTP_CODE=$(curl -s -o /dev/null -w "%{http_code}" -m 2 "$ENDPOINT" || true)
echo -e "\r服务健康检查中... 已等待 ${ELAPSED} 秒,当前状态码:${HTTP_CODE}"
if [ "$HTTP_CODE" = "200" ]; then
echo -e "\n服务启动成功耗时 ${ELAPSED}"
break
else
sleep $INTERVAL
fi
done
# 执行服务化推理
python -m pytest -s tests/ci_use/XPU_45T/run_w4a8.py
w4a8_test_exit_code=$?
echo w4a8_test_exit_code is ${w4a8_test_exit_code}
stop_processes >kill.log 2>&1
if [ ${w4a8_test_exit_code} -ne 0 ]; then
echo "server.log"
cat server.log
echo "log/workerlog.0"
cat log/workerlog.0
echo "w4a8 测试失败请检查pr代码"
exit 1
fi
sleep 5
# 起服务
rm -rf log/*
rm -f core*
# pkill -9 python #流水线不执行这个
#清空消息队列
ipcrm --all=msg
echo "============================开始vl模型测试!============================"
if [[ "$XPU_ID" == "0" ]]; then
export XPU_VISIBLE_DEVICES="0,1,2,3"
else
export XPU_VISIBLE_DEVICES="4,5,6,7"
fi
export port_num=$((8188 + XPU_ID * 100))
python -m fastdeploy.entrypoints.openai.api_server \
--model ${MODEL_PATH}/ERNIE-4.5-VL-28B-A3B-Thinking \
--port $port_num \
--engine-worker-queue-port $((port_num + 1)) \
--metrics-port $((port_num + 2)) \
--cache-queue-port $((port_num + 47873)) \
--tensor-parallel-size 4 \
--max-model-len 32768 \
--max-num-seqs 32 \
--quantization wint8 \
--reasoning-parser ernie-45-vl-thinking \
--tool-call-parser ernie-45-vl-thinking \
--mm-processor-kwargs '{"image_max_pixels": 12845056 }' \
--enable-chunked-prefill > server.log 2>&1 &
sleep 60
# 探活
TIMEOUT=$((15 * 60))
INTERVAL=10 # 检查间隔(秒)
ENDPOINT="http://0.0.0.0:${port_num}/health"
START_TIME=$(date +%s) # 记录开始时间戳
echo "开始服务健康检查,最长等待时间:${TIMEOUT}"
while true; do
# 计算已耗时
CURRENT_TIME=$(date +%s)
ELAPSED=$((CURRENT_TIME - START_TIME))
# 超时判断
if [ $ELAPSED -ge $TIMEOUT ]; then
echo -e "\n服务启动超时经过 $((TIMEOUT/60)) 分钟服务仍未启动!"
stop_processes
echo "server.log"
cat server.log
echo "log/workerlog.0"
cat log/workerlog.0
exit 1
fi
HTTP_CODE=$(curl -s -o /dev/null -w "%{http_code}" -m 2 "$ENDPOINT" || true)
echo -e "\r服务健康检查中... 已等待 ${ELAPSED} 秒,当前状态码:${HTTP_CODE}"
if [ "$HTTP_CODE" = "200" ]; then
echo -e "\n服务启动成功耗时 ${ELAPSED}"
break
else
sleep $INTERVAL
fi
done
# 执行服务化推理
python -m pytest -s tests/ci_use/XPU_45T/run_45vl.py
vl_test_exit_code=$?
echo vl_test_exit_code is ${vl_test_exit_code}
stop_processes >kill.log 2>&1
if [ ${vl_test_exit_code} -ne 0 ]; then
echo "server.log"
cat server.log
echo "log/workerlog.0"
cat log/workerlog.0
echo " vl模型 测试失败请检查pr代码"
exit 1
fi
echo "============================开始PD分离测试!============================"
export port_num=$((8188 + XPU_ID * 100))
# 起router
export FD_LOG_DIR="log_router"
mkdir -p ${FD_LOG_DIR}
nohup python -m fastdeploy.router.launch \
--port ${port_num} \
--splitwise \
2>&1 >${FD_LOG_DIR}/nohup &
sleep 1
# pd相关环境变量设置
export KVCACHE_GDRCOPY_FLUSH_ENABLE=1
export $(bash $DIR/get_rdma_nics.sh xpu)
echo "KVCACHE_RDMA_NICS:${KVCACHE_RDMA_NICS}"
export CUDA_ENABLE_P2P_NO_UVA=1 # 开启peer mem
if [ -z "${KVCACHE_RDMA_NICS}" ]; then
echo "KVCACHE_RDMA_NICS is empty, please check the output of get_rdma_nics.sh"
exit 1
fi
# 起P节点
export FD_LOG_DIR="log_prefill"
mkdir -p ${FD_LOG_DIR}
if [[ "$XPU_ID" == "0" ]]; then
export XPU_VISIBLE_DEVICES="0"
else
export XPU_VISIBLE_DEVICES="4"
fi
nohup python -m fastdeploy.entrypoints.openai.api_server \
--model ${MODEL_PATH}/ERNIE-4.5-0.3B-Paddle \
--port $((port_num+11)) \
--metrics-port $((port_num+12)) \
--engine-worker-queue-port $((port_num+13)) \
--cache-queue-port $((port_num+14)) \
--tensor-parallel-size 1 \
--max-model-len 32768 \
--splitwise-role "prefill" \
--cache-transfer-protocol "rdma" \
--rdma-comm-ports $((port_num+15)) \
--pd-comm-port $((port_num+16)) \
--router "0.0.0.0:$((port_num))" \
2>&1 >${FD_LOG_DIR}/nohup &
# 起D节点
export FD_LOG_DIR="log_decode"
mkdir -p ${FD_LOG_DIR}
if [[ "$XPU_ID" == "0" ]]; then
export XPU_VISIBLE_DEVICES="1"
else
export XPU_VISIBLE_DEVICES="5"
fi
nohup python -m fastdeploy.entrypoints.openai.api_server \
--model ${MODEL_PATH}/ERNIE-4.5-0.3B-Paddle \
--port $((port_num+21)) \
--metrics-port $((port_num+22)) \
--engine-worker-queue-port $((port_num+23)) \
--cache-queue-port $((port_num+24)) \
--tensor-parallel-size 1 \
--max-model-len 32768 \
--splitwise-role "decode" \
--cache-transfer-protocol "rdma" \
--rdma-comm-ports $((port_num+25)) \
--pd-comm-port $((port_num+26)) \
--router "0.0.0.0:${port_num}" \
2>&1 >${FD_LOG_DIR}/nohup &
sleep 60
# 探活
TIMEOUT=$((10 * 60))
INTERVAL=10 # 检查间隔(秒)
ENDPOINT_P="http://0.0.0.0:$((port_num+11))/health"
ENDPOINT_D="http://0.0.0.0:$((port_num+21))/health"
START_TIME=$(date +%s) # 记录开始时间戳
echo "开始服务健康检查,最长等待时间:${TIMEOUT}"
while true; do
# 计算已耗时
CURRENT_TIME=$(date +%s)
ELAPSED=$((CURRENT_TIME - START_TIME))
# 超时判断
if [ $ELAPSED -ge $TIMEOUT ]; then
echo -e "\n服务启动超时经过 $((TIMEOUT/60)) 分钟服务仍未启动!"
stop_processes
echo "log_prefill/nohup"
cat log_prefill/nohup
echo "log_decode/nohup"
cat log_decode/nohup
exit 1
fi
HTTP_CODE_P=$(curl -s -o /dev/null -w "%{http_code}" -m 2 "$ENDPOINT_P" || true)
HTTP_CODE_D=$(curl -s -o /dev/null -w "%{http_code}" -m 2 "$ENDPOINT_D" || true)
echo -e "\r服务健康检查中... 已等待 ${ELAPSED}当前状态码P节点:${HTTP_CODE_P}D节点:${HTTP_CODE_D}"
if [ "$HTTP_CODE_P" = "200" ] && [ "$HTTP_CODE_D" = "200" ]; then
echo -e "\n服务启动成功耗时 ${ELAPSED}"
break
else
sleep $INTERVAL
fi
done
# 执行服务化推理
python tests/ci_use/XPU_45T/run_pd.py
pd_test_exit_code=$?
echo pd_test_exit_code is ${pd_test_exit_code}
stop_processes >kill.log 2>&1
if [ ${pd_test_exit_code} -ne 0 ]; then
echo "log_prefill/nohup"
cat log_prefill/nohup
echo "log_decode/nohup"
cat log_decode/nohup
echo " pd分离模型 测试失败请检查pr代码"
exit 1
fi
# pd相关环境变量重置
unset KVCACHE_GDRCOPY_FLUSH_ENABLE
unset KVCACHE_RDMA_NICS
unset CUDA_ENABLE_P2P_NO_UVA
echo "============================开始 EP4TP4 在线服务测试!============================"
sleep 5
rm -rf log/*
rm -f core*
# pkill -9 python #流水线不执行这个
ipcrm --all=msg
xpu-smi
if [[ "$XPU_ID" == "0" ]]; then
export XPU_VISIBLE_DEVICES="0,1,2,3"
else
export XPU_VISIBLE_DEVICES="4,5,6,7"
fi
export BKCL_ENABLE_XDR=1
export BKCL_RDMA_NICS=xgbe1,xgbe2,xgbe3,xgbe4
export BKCL_TRACE_TOPO=1
export BKCL_PCIE_RING=1
export XSHMEM_MODE=1
export XSHMEM_QP_NUM_PER_RANK=32
export BKCL_RDMA_VERBS=1
export MOE_FFN_USE_DENSE_INPUT=1
wget -q https://paddle-qa.bj.bcebos.com/xpu_third_party/xDeepEP.tar.gz
tar -xzf xDeepEP.tar.gz
cd xDeepEP
bash build.sh
cd -
export port_num=$((8188 + XPU_ID * 100))
# 启动服务
python -m fastdeploy.entrypoints.openai.api_server \
--model ${MODEL_PATH}/ERNIE-4.5-300B-A47B-Paddle \
--port $port_num \
--tensor-parallel-size 4 \
--enable-expert-parallel \
--data-parallel-size 1 \
--max-model-len 32768 \
--max-num-seqs 64 \
--quantization "wint4" \
--engine-worker-queue-port $((port_num + 10)) \
--metrics-port $((port_num + 2)) \
--cache-queue-port $((port_num + 47873)) \
--disable-sequence-parallel-moe \
--gpu-memory-utilization 0.9 \
--load-choices "default" > server.log 2>&1 &
sleep 60
# 探活
TIMEOUT=$((15 * 60))
INTERVAL=10
ENDPOINT="http://0.0.0.0:${port_num}/health"
START_TIME=$(date +%s)
echo "开始服务健康检查,最长等待时间:${TIMEOUT}"
while true; do
CURRENT_TIME=$(date +%s)
ELAPSED=$((CURRENT_TIME - START_TIME))
if [ $ELAPSED -ge $TIMEOUT ]; then
echo -e "\n服务启动超时经过 $((TIMEOUT/60)) 分钟服务仍未启动!"
stop_processes
cat server.log
echo "log/workerlog.0"
cat log/workerlog.0
exit 1
fi
HTTP_CODE=$(curl -s -o /dev/null -w "%{http_code}" -m 2 "$ENDPOINT" || true)
echo -e "\r服务健康检查中... 已等待 ${ELAPSED} 秒,当前状态码:${HTTP_CODE}"
if [ "$HTTP_CODE" = "200" ]; then
echo -e "\n服务启动成功耗时 ${ELAPSED}"
break
else
sleep $INTERVAL
fi
done
# 执行在线推理验证脚本
python -m pytest -s tests/ci_use/XPU_45T/run_ep_online.py
ep_online_exit_code=$?
echo ep_online_exit_code is ${ep_online_exit_code}
unset BKCL_ENABLE_XDR
unset BKCL_RDMA_NICS
unset BKCL_TRACE_TOPO
unset BKCL_PCIE_RING
unset XSHMEM_MODE
unset XSHMEM_QP_NUM_PER_RANK
unset BKCL_RDMA_VERBS
unset MOE_FFN_USE_DENSE_INPUT
stop_processes >kill.log 2>&1
if [ ${ep_online_exit_code} -ne 0 ]; then
echo "server.log"
cat server.log
cat log/workerlog.0
echo "EP4TP4 在线服务相关测试失败请检查pr代码"
exit 1
fi
echo "============================开始 EP4TP1 在线服务测试!============================"
sleep 5
rm -rf log/*
rm -f core*
# pkill -9 python #流水线不执行这个
ipcrm --all=msg
xpu-smi
if [[ "$XPU_ID" == "0" ]]; then
export XPU_VISIBLE_DEVICES="0,1,2,3"
else
export XPU_VISIBLE_DEVICES="4,5,6,7"
fi
export BKCL_ENABLE_XDR=1
export BKCL_RDMA_NICS=xgbe1,xgbe2,xgbe3,xgbe4
export BKCL_TRACE_TOPO=1
export BKCL_PCIE_RING=1
export XSHMEM_MODE=1
export XSHMEM_QP_NUM_PER_RANK=32
export BKCL_RDMA_VERBS=1
export MOE_FFN_USE_DENSE_INPUT=1
export port_num=$((8188 + XPU_ID * 100))
# 启动服务
python -m fastdeploy.entrypoints.openai.api_server \
--model ${MODEL_PATH}/ERNIE-4.5-300B-A47B-Paddle \
--port $port_num \
--tensor-parallel-size 1 \
--enable-expert-parallel \
--data-parallel-size 4 \
--max-model-len 32768 \
--max-num-seqs 64 \
--quantization "wint4" \
--engine-worker-queue-port "$((port_num + 10)),$((port_num + 20)),$((port_num + 30)),$((port_num + 40))" \
--metrics-port $((port_num + 2)) \
--cache-queue-port $((port_num + 47873)) \
--gpu-memory-utilization 0.9 \
--load-choices "default" > server.log 2>&1 &
sleep 60
# 探活(同上)
TIMEOUT=$((15 * 60))
INTERVAL=10
ENDPOINT="http://0.0.0.0:${port_num}/health"
START_TIME=$(date +%s)
while true; do
CURRENT_TIME=$(date +%s)
ELAPSED=$((CURRENT_TIME - START_TIME))
if [ $ELAPSED -ge $TIMEOUT ]; then
echo -e "\n服务启动超时经过 $((TIMEOUT/60)) 分钟服务仍未启动!"
stop_processes
cat server.log
cat log/workerlog.0
exit 1
fi
HTTP_CODE=$(curl -s -o /dev/null -w "%{http_code}" -m 2 "$ENDPOINT" || true)
echo -e "\r服务健康检查中... 已等待 ${ELAPSED} 秒,当前状态码:${HTTP_CODE}"
if [ "$HTTP_CODE" = "200" ]; then
echo -e "\n服务启动成功耗时 ${ELAPSED}"
break
else
sleep $INTERVAL
fi
done
# 执行在线推理验证脚本
python -m pytest -s tests/ci_use/XPU_45T/run_ep_online.py
ep_online_exit_code=$?
echo ep_online_exit_code is ${ep_online_exit_code}
unset BKCL_ENABLE_XDR
unset BKCL_RDMA_NICS
unset BKCL_TRACE_TOPO
unset BKCL_PCIE_RING
unset XSHMEM_MODE
unset XSHMEM_QP_NUM_PER_RANK
unset BKCL_RDMA_VERBS
unset MOE_FFN_USE_DENSE_INPUT
stop_processes >kill.log 2>&1
if [ ${ep_online_exit_code} -ne 0 ]; then
echo "server.log"
cat server.log
cat log/workerlog.0
echo "EP4TP1 在线服务相关测试失败请检查pr代码"
exit 1
fi
echo "============================开始 EP4TP4 all2all 测试!============================"
sleep 5
rm -rf log/*
rm -f core*
# pkill -9 python #流水线不执行这个
ipcrm --all=msg
xpu-smi
if [[ "$XPU_ID" == "0" ]]; then
export XPU_VISIBLE_DEVICES="0,1,2,3"
else
export XPU_VISIBLE_DEVICES="4,5,6,7"
fi
export BKCL_ENABLE_XDR=1
export BKCL_RDMA_NICS=xgbe1,xgbe2,xgbe3,xgbe4
export BKCL_TRACE_TOPO=1
export BKCL_PCIE_RING=1
export XSHMEM_MODE=1
export XSHMEM_QP_NUM_PER_RANK=32
export BKCL_RDMA_VERBS=1
export MOE_FFN_USE_DENSE_INPUT=1
export port_num=$((8188 + XPU_ID * 100))
# 启动服务
python -m fastdeploy.entrypoints.openai.api_server \
--model ${MODEL_PATH}/ERNIE-4.5-300B-A47B-Paddle \
--port $port_num \
--tensor-parallel-size 4 \
--enable-expert-parallel \
--data-parallel-size 1 \
--max-model-len 32768 \
--max-num-seqs 64 \
--quantization "wint4" \
--engine-worker-queue-port $((port_num + 10)) \
--metrics-port $((port_num + 2)) \
--cache-queue-port $((port_num + 47873)) \
--gpu-memory-utilization 0.9 \
--load-choices "default" > server.log 2>&1 &
sleep 60
# 探活
TIMEOUT=$((15 * 60))
INTERVAL=10
ENDPOINT="http://0.0.0.0:${port_num}/health"
START_TIME=$(date +%s)
echo "开始服务健康检查,最长等待时间:${TIMEOUT}"
while true; do
CURRENT_TIME=$(date +%s)
ELAPSED=$((CURRENT_TIME - START_TIME))
if [ $ELAPSED -ge $TIMEOUT ]; then
echo -e "\n服务启动超时经过 $((TIMEOUT/60)) 分钟服务仍未启动!"
stop_processes
cat server.log
echo "log/workerlog.0"
cat log/workerlog.0
exit 1
fi
HTTP_CODE=$(curl -s -o /dev/null -w "%{http_code}" -m 2 "$ENDPOINT" || true)
echo -e "\r服务健康检查中... 已等待 ${ELAPSED} 秒,当前状态码:${HTTP_CODE}"
if [ "$HTTP_CODE" = "200" ]; then
echo -e "\n服务启动成功耗时 ${ELAPSED}"
break
else
sleep $INTERVAL
fi
done
# 执行在线推理验证脚本
python -m pytest -s tests/ci_use/XPU_45T/run_ep_online.py
ep_online_exit_code=$?
echo ep_online_exit_code is ${ep_online_exit_code}
unset BKCL_ENABLE_XDR
unset BKCL_RDMA_NICS
unset BKCL_TRACE_TOPO
unset BKCL_PCIE_RING
unset XSHMEM_MODE
unset XSHMEM_QP_NUM_PER_RANK
unset BKCL_RDMA_VERBS
unset MOE_FFN_USE_DENSE_INPUT
stop_processes >kill.log 2>&1
if [ ${ep_online_exit_code} -ne 0 ]; then
echo "server.log"
cat server.log
cat log/workerlog.0
echo "EP4TP4 all2all 在线服务相关测试失败请检查pr代码"
exit 1
fi