[XPU] [CI] Change CI ep test from offline to online (#4885)

* change CI ep test from offline to online

* switch the EP all2all CI test from offline to online as well

* change env var in ep-all2all ci test

* add expected response for ep8tp8 all2all

* Adapt to CI refactoring and support dual-concurrent code execution

* Adapt to CI refactoring and support dual-concurrent execution, second pass

* Explicitly specify the port number

* change the startup method of all2all

* Modify the command of all2all

* Update assertion to check multiple keywords

* Update assertion to check multiple keywords

* Update run_w4a8.py

* Update run_w4a8.py

---------

Co-authored-by: plusNew001 <95567040+plusNew001@users.noreply.github.com>
zccjjj
2025-11-13 16:15:45 +08:00
committed by GitHub
parent 4a0d881e15
commit 88da9d9788
4 changed files with 240 additions and 67 deletions


@@ -11,6 +11,9 @@ function stop_processes() {
ps -efww | grep -E 'api_server' | grep -v grep | awk '{print $2}' | xargs kill -9 || true
ps -efww | grep -E "$((8188 + GPU_ID * 100))" | grep -v grep | awk '{print $2}' | xargs kill -9 || true
lsof -t -i :$((8188 + GPU_ID * 100)) | xargs kill -9 || true
# Brace expansion cannot use variables, so use seq to iterate over the engine worker queue port range
for port in $(seq $((8188 + GPU_ID * 100 + 10)) $((8188 + GPU_ID * 100 + 40))); do
lsof -t -i :${port} | xargs kill -9 || true
done
}
stop_processes
@@ -286,10 +289,11 @@ if [ ${vl_test_exit_code} -ne 0 ]; then
fi
echo "============================开始 EP8TP1 测试!============================"
echo "============================开始 EP4TP4 在线服务测试!============================"
sleep 5
rm -rf log/*
rm -f core*
# pkill -9 python  # not executed in the pipeline
ipcrm --all=msg
xpu-smi
if [[ "$GPU_ID" == "0" ]]; then
@@ -312,11 +316,58 @@ cd xDeepEP
bash build.sh
cd -
export enable_expert_parallel=1
export enable_tensor_parallel=0
export port_num=$((8188 + GPU_ID * 100))
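# Base port 8188 + GPU_ID*100 keeps the two concurrent CI jobs on disjoint ports; derived ports below: +2 metrics, +10 engine worker queue, +47873 cache queue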
# Start the API server
python -m fastdeploy.entrypoints.openai.api_server \
--model ${MODEL_PATH}/ERNIE-4.5-300B-A47B-Paddle \
--port $port_num \
--tensor-parallel-size 4 \
--enable-expert-parallel \
--data-parallel-size 1 \
--max-model-len 32768 \
--max-num-seqs 64 \
--quantization "wint4" \
--engine-worker-queue-port $((port_num + 10)) \
--metrics-port $((port_num + 2)) \
--cache-queue-port $((port_num + 47873)) \
--disable-sequence-parallel-moe \
--gpu-memory-utilization 0.9 \
--load-choices "default" > server.log 2>&1 &
python -m pytest -s --timeout=600 tests/ci_use/XPU_45T/run_ep.py
ep_exit_code=$?
sleep 60
# Health check
TIMEOUT=$((15 * 60))
INTERVAL=10
ENDPOINT="http://0.0.0.0:${port_num}/health"
START_TIME=$(date +%s)
echo "开始服务健康检查,最长等待时间:${TIMEOUT}"
while true; do
CURRENT_TIME=$(date +%s)
ELAPSED=$((CURRENT_TIME - START_TIME))
if [ $ELAPSED -ge $TIMEOUT ]; then
echo -e "\n服务启动超时经过 $((TIMEOUT/60)) 分钟服务仍未启动!"
stop_processes
cat server.log
echo "log/workerlog.0"
cat log/workerlog.0
exit 1
fi
HTTP_CODE=$(curl -s -o /dev/null -w "%{http_code}" -m 2 "$ENDPOINT" || true)
echo -e "\r服务健康检查中... 已等待 ${ELAPSED} 秒,当前状态码:${HTTP_CODE}"
if [ "$HTTP_CODE" = "200" ]; then
echo -e "\n服务启动成功耗时 ${ELAPSED}"
break
else
sleep $INTERVAL
fi
done
cat server.log
# Run the online inference verification script
python tests/ci_use/XPU_45T/run_ep_online.py
ep_online_exit_code=$?
echo ep_online_exit_code is ${ep_online_exit_code}
unset BKCL_ENABLE_XDR
unset BKCL_RDMA_NICS
@@ -327,18 +378,101 @@ unset XSHMEM_QP_NUM_PER_RANK
unset BKCL_RDMA_VERBS
stop_processes
if [ ${ep_exit_code} -ne 0 ]; then
echo "log/workerlog.0"
if [ ${ep_online_exit_code} -ne 0 ]; then
cat log/workerlog.0
echo "EP8TP1 相关测试失败请检查pr代码"
echo "EP4TP4 在线服务相关测试失败请检查pr代码"
exit 1
fi
echo "============================开始 EP8TP8 allreduce 测试!============================"
echo "============================开始 EP4TP1 在线服务测试!============================"
sleep 5
rm -rf log/*
rm -f core*
# pkill -9 python  # not executed in the pipeline
ipcrm --all=msg
xpu-smi
if [[ "$GPU_ID" == "0" ]]; then
export XPU_VISIBLE_DEVICES="0,1,2,3"
else
export XPU_VISIBLE_DEVICES="4,5,6,7"
fi
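# BKCL / XSHMEM settings: enable RDMA over the listed NICs and shared-memory transport for cross-card communication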
export BKCL_ENABLE_XDR=1
export BKCL_RDMA_NICS=xgbe1,xgbe2,xgbe3,xgbe4
export BKCL_TRACE_TOPO=1
export BKCL_PCIE_RING=1
export XSHMEM_MODE=1
export XSHMEM_QP_NUM_PER_RANK=32
export BKCL_RDMA_VERBS=1
export port_num=$((8188 + GPU_ID * 100))
# Start the API server
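# With --data-parallel-size 4, one engine worker queue port per DP rank is passed as a comma-separated list (+10/+20/+30/+40); stop_processes cleans exactly this range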
python -m fastdeploy.entrypoints.openai.api_server \
--model ${MODEL_PATH}/ERNIE-4.5-300B-A47B-Paddle \
--port $port_num \
--tensor-parallel-size 1 \
--enable-expert-parallel \
--data-parallel-size 4 \
--max-model-len 32768 \
--max-num-seqs 64 \
--quantization "wint4" \
--engine-worker-queue-port "$((port_num + 10)),$((port_num + 20)),$((port_num + 30)),$((port_num + 40))" \
--metrics-port $((port_num + 2)) \
--cache-queue-port $((port_num + 47873)) \
--gpu-memory-utilization 0.9 \
--load-choices "default" > server.log 2>&1 &
sleep 60
# Health check (same as above)
TIMEOUT=$((15 * 60))
INTERVAL=10
ENDPOINT="http://0.0.0.0:${port_num}/health"
START_TIME=$(date +%s)
while true; do
CURRENT_TIME=$(date +%s)
ELAPSED=$((CURRENT_TIME - START_TIME))
if [ $ELAPSED -ge $TIMEOUT ]; then
echo -e "\n服务启动超时经过 $((TIMEOUT/60)) 分钟服务仍未启动!"
stop_processes
cat server.log
cat log/workerlog.0
exit 1
fi
HTTP_CODE=$(curl -s -o /dev/null -w "%{http_code}" -m 2 "$ENDPOINT" || true)
if [ "$HTTP_CODE" = "200" ]; then
echo -e "\n服务启动成功耗时 ${ELAPSED}"
break
else
sleep $INTERVAL
fi
done
cat server.log
# Run the online inference verification script
python tests/ci_use/XPU_45T/run_ep_online.py
ep_online_exit_code=$?
echo ep_online_exit_code is ${ep_online_exit_code}
unset BKCL_ENABLE_XDR
unset BKCL_RDMA_NICS
unset BKCL_TRACE_TOPO
unset BKCL_PCIE_RING
unset XSHMEM_MODE
unset XSHMEM_QP_NUM_PER_RANK
unset BKCL_RDMA_VERBS
stop_processes
if [ ${ep_online_exit_code} -ne 0 ]; then
cat log/workerlog.0
echo "EP4TP1 在线服务相关测试失败请检查pr代码"
exit 1
fi
echo "============================开始 EP4TP4 all2all 测试!============================"
sleep 5
rm -rf log/*
rm -f core*
# pkill -9 python  # not executed in the pipeline
ipcrm --all=msg
xpu-smi
if [[ "$GPU_ID" == "0" ]]; then
@@ -355,12 +489,57 @@ export XSHMEM_MODE=1
export XSHMEM_QP_NUM_PER_RANK=32
export BKCL_RDMA_VERBS=1
export enable_expert_parallel=1
export enable_tensor_parallel=1
export disable_sequence_parallel_moe=1
export port_num=$((8188 + GPU_ID * 100))
# Start the API server
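# Same EP4TP4 launch as the first section but without --disable-sequence-parallel-moe, exercising the all2all MoE path named in the section title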
python -m fastdeploy.entrypoints.openai.api_server \
--model ${MODEL_PATH}/ERNIE-4.5-300B-A47B-Paddle \
--port $port_num \
--tensor-parallel-size 4 \
--enable-expert-parallel \
--data-parallel-size 1 \
--max-model-len 32768 \
--max-num-seqs 64 \
--quantization "wint4" \
--engine-worker-queue-port $((port_num + 10)) \
--metrics-port $((port_num + 2)) \
--cache-queue-port $((port_num + 47873)) \
--gpu-memory-utilization 0.9 \
--load-choices "default" > server.log 2>&1 &
python -m pytest -s --timeout=600 tests/ci_use/XPU_45T/run_ep.py
ep_exit_code=$?
sleep 60
# Health check
TIMEOUT=$((15 * 60))
INTERVAL=10
ENDPOINT="http://0.0.0.0:${port_num}/health"
START_TIME=$(date +%s)
echo "开始服务健康检查,最长等待时间:${TIMEOUT}"
while true; do
CURRENT_TIME=$(date +%s)
ELAPSED=$((CURRENT_TIME - START_TIME))
if [ $ELAPSED -ge $TIMEOUT ]; then
echo -e "\n服务启动超时经过 $((TIMEOUT/60)) 分钟服务仍未启动!"
stop_processes
cat server.log
echo "log/workerlog.0"
cat log/workerlog.0
exit 1
fi
HTTP_CODE=$(curl -s -o /dev/null -w "%{http_code}" -m 2 "$ENDPOINT" || true)
echo -e "\r服务健康检查中... 已等待 ${ELAPSED} 秒,当前状态码:${HTTP_CODE}"
if [ "$HTTP_CODE" = "200" ]; then
echo -e "\n服务启动成功耗时 ${ELAPSED}"
break
else
sleep $INTERVAL
fi
done
cat server.log
# Run the online inference verification script
python tests/ci_use/XPU_45T/run_ep_online.py
ep_online_exit_code=$?
echo ep_online_exit_code is ${ep_online_exit_code}
unset BKCL_ENABLE_XDR
unset BKCL_RDMA_NICS
@@ -369,58 +548,10 @@ unset BKCL_PCIE_RING
unset XSHMEM_MODE
unset XSHMEM_QP_NUM_PER_RANK
unset BKCL_RDMA_VERBS
unset enable_expert_parallel
unset enable_tensor_parallel
unset disable_sequence_parallel_moe
stop_processes
if [ ${ep_exit_code} -ne 0 ]; then
echo "log/workerlog.0"
if [ ${ep_online_exit_code} -ne 0 ]; then
cat log/workerlog.0
echo "EP8TP8 allreduce 相关测试失败请检查pr代码"
exit 1
fi
echo "============================开始 EP8TP8 all2all 测试!============================"
sleep 5
rm -rf log/*
rm -f core*
ipcrm --all=msg
xpu-smi
if [[ "$GPU_ID" == "0" ]]; then
export XPU_VISIBLE_DEVICES="0,1,2,3"
else
export XPU_VISIBLE_DEVICES="4,5,6,7"
fi
export BKCL_ENABLE_XDR=1
export BKCL_RDMA_NICS=xgbe1,xgbe2,xgbe3,xgbe4
export BKCL_TRACE_TOPO=1
export BKCL_PCIE_RING=1
export XSHMEM_MODE=1
export XSHMEM_QP_NUM_PER_RANK=32
export BKCL_RDMA_VERBS=1
export enable_expert_parallel=1
export enable_tensor_parallel=1
python -m pytest -s --timeout=600 tests/ci_use/XPU_45T/run_ep.py
ep_exit_code=$?
unset BKCL_ENABLE_XDR
unset BKCL_RDMA_NICS
unset BKCL_TRACE_TOPO
unset BKCL_PCIE_RING
unset XSHMEM_MODE
unset XSHMEM_QP_NUM_PER_RANK
unset BKCL_RDMA_VERBS
unset enable_expert_parallel
unset enable_tensor_parallel
stop_processes
if [ ${ep_exit_code} -ne 0 ]; then
echo "log/workerlog.0"
cat log/workerlog.0
echo "EP8TP8 all2all 相关测试失败请检查pr代码"
echo "EP4TP4 all2all 在线服务相关测试失败请检查pr代码"
exit 1
fi


@@ -36,7 +36,7 @@ def test_45t():
    )
    print(response.choices[0].message.content)
    # print(base_response)
    assert "人工智能" in response.choices[0].message.content
    assert any(keyword in response.choices[0].message.content for keyword in ["人工智能", "文心一言"])
if __name__ == "__main__":


@@ -0,0 +1,42 @@
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os

import openai


def test_ep():
    ip = "0.0.0.0"
    # GPU_ID selects which of the two concurrent CI jobs' servers to query
    gpu_id = int(os.getenv("GPU_ID", "0"))
    service_http_port = 8188 + gpu_id * 100  # must match the port the service was started with
    client = openai.Client(base_url=f"http://{ip}:{service_http_port}/v1", api_key="EMPTY_API_KEY")

    # Non-streaming chat completion
    response = client.chat.completions.create(
        model="default",
        messages=[
            {"role": "user", "content": "你好,你是谁?"},
        ],
        temperature=1,
        top_p=0,
        max_tokens=64,
        stream=False,
    )
    print(response.choices[0].message.content)
    # print(base_response)
    # Accept any of the expected keywords in the reply
    assert any(keyword in response.choices[0].message.content for keyword in ["人工智能", "文心一言"])


if __name__ == "__main__":
    test_ep()


@@ -36,7 +36,7 @@ def test_w4a8():
    )
    print(response.choices[0].message.content)
    # print(base_response)
    assert "人工智能" in response.choices[0].message.content
    assert any(keyword in response.choices[0].message.content for keyword in ["人工智能", "文心一言"])
if __name__ == "__main__":