mirror of
https://github.com/PaddlePaddle/FastDeploy.git
synced 2025-12-24 13:28:13 +08:00
[XPU] [CI] Change CI ep test from offline to online (#4885)
* change CI ep test from offline to online * add ep all2all ci's changes, from offline to online * change env var in ep-all2all ci test * add expected response for ep8tp8 all2all * Adapt to CI refactoring and support dual-concurrent code execution * Adapt to CI refactoring and support dual-concurrent, second * Explicitly specify the #port * change the startup method of all2all * Modify the command of all2all * Update assertion to check multiple keywords * Update assertion to check multiple keywords * Update run_w4a8.py * Update run_w4a8.py --------- Co-authored-by: plusNew001 <95567040+plusNew001@users.noreply.github.com>
This commit is contained in:
@@ -11,6 +11,9 @@ function stop_processes() {
|
||||
ps -efww | grep -E 'api_server' | grep -v grep | awk '{print $2}' | xargs kill -9 || true
|
||||
ps -efww | grep -E "$((8188 + GPU_ID * 100))" | grep -v grep | awk '{print $2}' | xargs kill -9 || true
|
||||
lsof -t -i :$((8188 + GPU_ID * 100)) | xargs kill -9 || true
|
||||
for port in {$((8188 + GPU_ID * 100 + 10))..$((8188 + GPU_ID * 100 + 40))}; do
|
||||
lsof -t -i :${port} | xargs kill -9 || true
|
||||
done
|
||||
}
|
||||
stop_processes
|
||||
|
||||
@@ -286,10 +289,11 @@ if [ ${vl_test_exit_code} -ne 0 ]; then
|
||||
fi
|
||||
|
||||
|
||||
echo "============================开始 EP8TP1 测试!============================"
|
||||
echo "============================开始 EP4TP4 在线服务测试!============================"
|
||||
sleep 5
|
||||
rm -rf log/*
|
||||
rm -f core*
|
||||
# pkill -9 python #流水线不执行这个
|
||||
ipcrm --all=msg
|
||||
xpu-smi
|
||||
if [[ "$GPU_ID" == "0" ]]; then
|
||||
@@ -312,11 +316,58 @@ cd xDeepEP
|
||||
bash build.sh
|
||||
cd -
|
||||
|
||||
export enable_expert_parallel=1
|
||||
export enable_tensor_parallel=0
|
||||
export port_num=$((8188 + GPU_ID * 100))
|
||||
# 启动服务
|
||||
python -m fastdeploy.entrypoints.openai.api_server \
|
||||
--model ${MODEL_PATH}/ERNIE-4.5-300B-A47B-Paddle \
|
||||
--port $port_num \
|
||||
--tensor-parallel-size 4 \
|
||||
--enable-expert-parallel \
|
||||
--data-parallel-size 1 \
|
||||
--max-model-len 32768 \
|
||||
--max-num-seqs 64 \
|
||||
--quantization "wint4" \
|
||||
--engine-worker-queue-port $((port_num + 10)) \
|
||||
--metrics-port $((port_num + 2)) \
|
||||
--cache-queue-port $((port_num + 47873)) \
|
||||
--disable-sequence-parallel-moe \
|
||||
--gpu-memory-utilization 0.9 \
|
||||
--load-choices "default" > server.log 2>&1 &
|
||||
|
||||
python -m pytest -s --timeout=600 tests/ci_use/XPU_45T/run_ep.py
|
||||
ep_exit_code=$?
|
||||
sleep 60
|
||||
# 探活
|
||||
TIMEOUT=$((15 * 60))
|
||||
INTERVAL=10
|
||||
ENDPOINT="http://0.0.0.0:${port_num}/health"
|
||||
START_TIME=$(date +%s)
|
||||
echo "开始服务健康检查,最长等待时间:${TIMEOUT}秒"
|
||||
while true; do
|
||||
CURRENT_TIME=$(date +%s)
|
||||
ELAPSED=$((CURRENT_TIME - START_TIME))
|
||||
if [ $ELAPSED -ge $TIMEOUT ]; then
|
||||
echo -e "\n服务启动超时:经过 $((TIMEOUT/60)) 分钟服务仍未启动!"
|
||||
stop_processes
|
||||
cat server.log
|
||||
echo "log/workerlog.0"
|
||||
cat log/workerlog.0
|
||||
exit 1
|
||||
fi
|
||||
HTTP_CODE=$(curl -s -o /dev/null -w "%{http_code}" -m 2 "$ENDPOINT" || true)
|
||||
echo -e "\r服务健康检查中... 已等待 ${ELAPSED} 秒,当前状态码:${HTTP_CODE}"
|
||||
if [ "$HTTP_CODE" = "200" ]; then
|
||||
echo -e "\n服务启动成功!耗时 ${ELAPSED} 秒"
|
||||
break
|
||||
else
|
||||
sleep $INTERVAL
|
||||
fi
|
||||
done
|
||||
|
||||
cat server.log
|
||||
|
||||
# 执行在线推理验证脚本
|
||||
python tests/ci_use/XPU_45T/run_ep_online.py
|
||||
ep_online_exit_code=$?
|
||||
echo ep_online_exit_code is ${ep_online_exit_code}
|
||||
|
||||
unset BKCL_ENABLE_XDR
|
||||
unset BKCL_RDMA_NICS
|
||||
@@ -327,18 +378,101 @@ unset XSHMEM_QP_NUM_PER_RANK
|
||||
unset BKCL_RDMA_VERBS
|
||||
stop_processes
|
||||
|
||||
if [ ${ep_exit_code} -ne 0 ]; then
|
||||
echo "log/workerlog.0"
|
||||
if [ ${ep_online_exit_code} -ne 0 ]; then
|
||||
cat log/workerlog.0
|
||||
echo "EP8TP1 相关测试失败,请检查pr代码"
|
||||
echo "EP4TP4 在线服务相关测试失败,请检查pr代码"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
|
||||
echo "============================开始 EP8TP8 allreduce 测试!============================"
|
||||
echo "============================开始 EP4TP1 在线服务测试!============================"
|
||||
sleep 5
|
||||
rm -rf log/*
|
||||
rm -f core*
|
||||
# pkill -9 python #流水线不执行这个
|
||||
ipcrm --all=msg
|
||||
xpu-smi
|
||||
if [[ "$GPU_ID" == "0" ]]; then
|
||||
export XPU_VISIBLE_DEVICES="0,1,2,3"
|
||||
else
|
||||
export XPU_VISIBLE_DEVICES="4,5,6,7"
|
||||
fi
|
||||
export BKCL_ENABLE_XDR=1
|
||||
export BKCL_RDMA_NICS=xgbe1,xgbe2,xgbe3,xgbe4
|
||||
export BKCL_TRACE_TOPO=1
|
||||
export BKCL_PCIE_RING=1
|
||||
export XSHMEM_MODE=1
|
||||
export XSHMEM_QP_NUM_PER_RANK=32
|
||||
export BKCL_RDMA_VERBS=1
|
||||
|
||||
export port_num=$((8188 + GPU_ID * 100))
|
||||
# 启动服务
|
||||
python -m fastdeploy.entrypoints.openai.api_server \
|
||||
--model ${MODEL_PATH}/ERNIE-4.5-300B-A47B-Paddle \
|
||||
--port $port_num \
|
||||
--tensor-parallel-size 1 \
|
||||
--enable-expert-parallel \
|
||||
--data-parallel-size 4 \
|
||||
--max-model-len 32768 \
|
||||
--max-num-seqs 64 \
|
||||
--quantization "wint4" \
|
||||
--engine-worker-queue-port "$((port_num + 10)),$((port_num + 20)),$((port_num + 30)),$((port_num + 40))" \
|
||||
--metrics-port $((port_num + 2)) \
|
||||
--cache-queue-port $((port_num + 47873)) \
|
||||
--gpu-memory-utilization 0.9 \
|
||||
--load-choices "default" > server.log 2>&1 &
|
||||
|
||||
sleep 60
|
||||
# 探活(同上)
|
||||
TIMEOUT=$((15 * 60))
|
||||
INTERVAL=10
|
||||
ENDPOINT="http://0.0.0.0:${port_num}/health"
|
||||
START_TIME=$(date +%s)
|
||||
while true; do
|
||||
CURRENT_TIME=$(date +%s)
|
||||
ELAPSED=$((CURRENT_TIME - START_TIME))
|
||||
if [ $ELAPSED -ge $TIMEOUT ]; then
|
||||
echo -e "\n服务启动超时:经过 $((TIMEOUT/60)) 分钟服务仍未启动!"
|
||||
stop_processes
|
||||
cat server.log
|
||||
cat log/workerlog.0
|
||||
exit 1
|
||||
fi
|
||||
HTTP_CODE=$(curl -s -o /dev/null -w "%{http_code}" -m 2 "$ENDPOINT" || true)
|
||||
if [ "$HTTP_CODE" = "200" ]; then
|
||||
echo -e "\n服务启动成功!耗时 ${ELAPSED} 秒"
|
||||
break
|
||||
else
|
||||
sleep $INTERVAL
|
||||
fi
|
||||
done
|
||||
|
||||
cat server.log
|
||||
|
||||
# 执行在线推理验证脚本
|
||||
python tests/ci_use/XPU_45T/run_ep_online.py
|
||||
ep_online_exit_code=$?
|
||||
echo ep_online_exit_code is ${ep_online_exit_code}
|
||||
|
||||
unset BKCL_ENABLE_XDR
|
||||
unset BKCL_RDMA_NICS
|
||||
unset BKCL_TRACE_TOPO
|
||||
unset BKCL_PCIE_RING
|
||||
unset XSHMEM_MODE
|
||||
unset XSHMEM_QP_NUM_PER_RANK
|
||||
unset BKCL_RDMA_VERBS
|
||||
stop_processes
|
||||
|
||||
if [ ${ep_online_exit_code} -ne 0 ]; then
|
||||
cat log/workerlog.0
|
||||
echo "EP4TP1 在线服务相关测试失败,请检查pr代码"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
echo "============================开始 EP4TP4 all2all 测试!============================"
|
||||
sleep 5
|
||||
rm -rf log/*
|
||||
rm -f core*
|
||||
# pkill -9 python #流水线不执行这个
|
||||
ipcrm --all=msg
|
||||
xpu-smi
|
||||
if [[ "$GPU_ID" == "0" ]]; then
|
||||
@@ -355,12 +489,57 @@ export XSHMEM_MODE=1
|
||||
export XSHMEM_QP_NUM_PER_RANK=32
|
||||
export BKCL_RDMA_VERBS=1
|
||||
|
||||
export enable_expert_parallel=1
|
||||
export enable_tensor_parallel=1
|
||||
export disable_sequence_parallel_moe=1
|
||||
export port_num=$((8188 + GPU_ID * 100))
|
||||
# 启动服务
|
||||
python -m fastdeploy.entrypoints.openai.api_server \
|
||||
--model ${MODEL_PATH}/ERNIE-4.5-300B-A47B-Paddle \
|
||||
--port $port_num \
|
||||
--tensor-parallel-size 4 \
|
||||
--enable-expert-parallel \
|
||||
--data-parallel-size 1 \
|
||||
--max-model-len 32768 \
|
||||
--max-num-seqs 64 \
|
||||
--quantization "wint4" \
|
||||
--engine-worker-queue-port $((port_num + 10)) \
|
||||
--metrics-port $((port_num + 2)) \
|
||||
--cache-queue-port $((port_num + 47873)) \
|
||||
--gpu-memory-utilization 0.9 \
|
||||
--load-choices "default" > server.log 2>&1 &
|
||||
|
||||
python -m pytest -s --timeout=600 tests/ci_use/XPU_45T/run_ep.py
|
||||
ep_exit_code=$?
|
||||
sleep 60
|
||||
# 探活
|
||||
TIMEOUT=$((15 * 60))
|
||||
INTERVAL=10
|
||||
ENDPOINT="http://0.0.0.0:${port_num}/health"
|
||||
START_TIME=$(date +%s)
|
||||
echo "开始服务健康检查,最长等待时间:${TIMEOUT}秒"
|
||||
while true; do
|
||||
CURRENT_TIME=$(date +%s)
|
||||
ELAPSED=$((CURRENT_TIME - START_TIME))
|
||||
if [ $ELAPSED -ge $TIMEOUT ]; then
|
||||
echo -e "\n服务启动超时:经过 $((TIMEOUT/60)) 分钟服务仍未启动!"
|
||||
stop_processes
|
||||
cat server.log
|
||||
echo "log/workerlog.0"
|
||||
cat log/workerlog.0
|
||||
exit 1
|
||||
fi
|
||||
HTTP_CODE=$(curl -s -o /dev/null -w "%{http_code}" -m 2 "$ENDPOINT" || true)
|
||||
echo -e "\r服务健康检查中... 已等待 ${ELAPSED} 秒,当前状态码:${HTTP_CODE}"
|
||||
if [ "$HTTP_CODE" = "200" ]; then
|
||||
echo -e "\n服务启动成功!耗时 ${ELAPSED} 秒"
|
||||
break
|
||||
else
|
||||
sleep $INTERVAL
|
||||
fi
|
||||
done
|
||||
|
||||
cat server.log
|
||||
|
||||
# 执行在线推理验证脚本
|
||||
python tests/ci_use/XPU_45T/run_ep_online.py
|
||||
ep_online_exit_code=$?
|
||||
echo ep_online_exit_code is ${ep_online_exit_code}
|
||||
|
||||
unset BKCL_ENABLE_XDR
|
||||
unset BKCL_RDMA_NICS
|
||||
@@ -369,58 +548,10 @@ unset BKCL_PCIE_RING
|
||||
unset XSHMEM_MODE
|
||||
unset XSHMEM_QP_NUM_PER_RANK
|
||||
unset BKCL_RDMA_VERBS
|
||||
unset enable_expert_parallel
|
||||
unset enable_tensor_parallel
|
||||
unset disable_sequence_parallel_moe
|
||||
stop_processes
|
||||
|
||||
if [ ${ep_exit_code} -ne 0 ]; then
|
||||
echo "log/workerlog.0"
|
||||
if [ ${ep_online_exit_code} -ne 0 ]; then
|
||||
cat log/workerlog.0
|
||||
echo "EP8TP8 allreduce 相关测试失败,请检查pr代码"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
|
||||
echo "============================开始 EP8TP8 all2all 测试!============================"
|
||||
sleep 5
|
||||
rm -rf log/*
|
||||
rm -f core*
|
||||
ipcrm --all=msg
|
||||
xpu-smi
|
||||
if [[ "$GPU_ID" == "0" ]]; then
|
||||
export XPU_VISIBLE_DEVICES="0,1,2,3"
|
||||
else
|
||||
export XPU_VISIBLE_DEVICES="4,5,6,7"
|
||||
fi
|
||||
export BKCL_ENABLE_XDR=1
|
||||
export BKCL_RDMA_NICS=xgbe1,xgbe2,xgbe3,xgbe4
|
||||
export BKCL_TRACE_TOPO=1
|
||||
export BKCL_PCIE_RING=1
|
||||
export XSHMEM_MODE=1
|
||||
export XSHMEM_QP_NUM_PER_RANK=32
|
||||
export BKCL_RDMA_VERBS=1
|
||||
|
||||
export enable_expert_parallel=1
|
||||
export enable_tensor_parallel=1
|
||||
|
||||
python -m pytest -s --timeout=600 tests/ci_use/XPU_45T/run_ep.py
|
||||
ep_exit_code=$?
|
||||
|
||||
unset BKCL_ENABLE_XDR
|
||||
unset BKCL_RDMA_NICS
|
||||
unset BKCL_TRACE_TOPO
|
||||
unset BKCL_PCIE_RING
|
||||
unset XSHMEM_MODE
|
||||
unset XSHMEM_QP_NUM_PER_RANK
|
||||
unset BKCL_RDMA_VERBS
|
||||
unset enable_expert_parallel
|
||||
unset enable_tensor_parallel
|
||||
stop_processes
|
||||
|
||||
if [ ${ep_exit_code} -ne 0 ]; then
|
||||
echo "log/workerlog.0"
|
||||
cat log/workerlog.0
|
||||
echo "EP8TP8 all2all 相关测试失败,请检查pr代码"
|
||||
echo "EP4TP4 all2all 在线服务相关测试失败,请检查pr代码"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
@@ -36,7 +36,7 @@ def test_45t():
|
||||
)
|
||||
print(response.choices[0].message.content)
|
||||
# print(base_response)
|
||||
assert "人工智能" in response.choices[0].message.content
|
||||
assert any(keyword in response.choices[0].message.content for keyword in ["人工智能", "文心一言"])
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
42
tests/ci_use/XPU_45T/run_ep_online.py
Normal file
42
tests/ci_use/XPU_45T/run_ep_online.py
Normal file
@@ -0,0 +1,42 @@
|
||||
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
import os
|
||||
|
||||
import openai
|
||||
|
||||
|
||||
def test_ep():
|
||||
ip = "0.0.0.0"
|
||||
gpu_id = int(os.getenv("GPU_ID", "0"))
|
||||
service_http_port = 8188 + gpu_id * 100 # 服务配置的
|
||||
client = openai.Client(base_url=f"http://{ip}:{service_http_port}/v1", api_key="EMPTY_API_KEY")
|
||||
# 非流式对话
|
||||
response = client.chat.completions.create(
|
||||
model="default",
|
||||
messages=[
|
||||
{"role": "user", "content": "你好,你是谁?"},
|
||||
],
|
||||
temperature=1,
|
||||
top_p=0,
|
||||
max_tokens=64,
|
||||
stream=False,
|
||||
)
|
||||
|
||||
print(response.choices[0].message.content)
|
||||
# print(base_response)
|
||||
assert any(keyword in response.choices[0].message.content for keyword in ["人工智能", "文心一言"])
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
test_ep()
|
||||
@@ -36,7 +36,7 @@ def test_w4a8():
|
||||
)
|
||||
print(response.choices[0].message.content)
|
||||
# print(base_response)
|
||||
assert "人工智能" in response.choices[0].message.content
|
||||
assert any(keyword in response.choices[0].message.content for keyword in ["人工智能", "文心一言"])
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
Reference in New Issue
Block a user