[XPU] [CI] Change CI ep test from offline to online (#4885)

* change CI ep test from offline to online

* switch the EP all2all CI test from offline to online as well

* change env var in ep-all2all ci test

* add expected response for ep8tp8 all2all

* Adapt to CI refactoring and support dual-concurrent code execution

* Adapt to CI refactoring and support dual-concurrent execution, second pass

* Explicitly specify the port number

* change the startup method of all2all

* Modify the command of all2all

* Update assertion to check multiple keywords

* Update assertion to check multiple keywords

* Update run_w4a8.py

* Update run_w4a8.py

---------

Co-authored-by: plusNew001 <95567040+plusNew001@users.noreply.github.com>
zccjjj
2025-11-13 16:15:45 +08:00
committed by GitHub
parent 4a0d881e15
commit 88da9d9788
4 changed files with 240 additions and 67 deletions


@@ -11,6 +11,9 @@ function stop_processes() {
ps -efww | grep -E 'api_server' | grep -v grep | awk '{print $2}' | xargs kill -9 || true
ps -efww | grep -E "$((8188 + GPU_ID * 100))" | grep -v grep | awk '{print $2}' | xargs kill -9 || true
lsof -t -i :$((8188 + GPU_ID * 100)) | xargs kill -9 || true
# Brace expansion cannot use variables, so use seq to iterate over the engine worker queue port range
for port in $(seq $((8188 + GPU_ID * 100 + 10)) $((8188 + GPU_ID * 100 + 40))); do
lsof -t -i :${port} | xargs kill -9 || true
done
}
stop_processes
@@ -286,10 +289,11 @@ if [ ${vl_test_exit_code} -ne 0 ]; then
fi
echo "============================开始 EP8TP1 测试!============================"
echo "============================开始 EP4TP4 在线服务测试!============================"
sleep 5
rm -rf log/*
rm -f core*
# pkill -9 python  # not executed in the pipeline
ipcrm --all=msg
xpu-smi
if [[ "$GPU_ID" == "0" ]]; then
@@ -312,11 +316,58 @@ cd xDeepEP
bash build.sh
cd -
export enable_expert_parallel=1
export enable_tensor_parallel=0
export port_num=$((8188 + GPU_ID * 100))
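# Base port 8188 + GPU_ID*100 keeps the two concurrent CI jobs on disjoint ports; derived ports below: +2 metrics, +10 engine worker queue, +47873 cache queue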
# Start the API server
python -m fastdeploy.entrypoints.openai.api_server \
--model ${MODEL_PATH}/ERNIE-4.5-300B-A47B-Paddle \
--port $port_num \
--tensor-parallel-size 4 \
--enable-expert-parallel \
--data-parallel-size 1 \
--max-model-len 32768 \
--max-num-seqs 64 \
--quantization "wint4" \
--engine-worker-queue-port $((port_num + 10)) \
--metrics-port $((port_num + 2)) \
--cache-queue-port $((port_num + 47873)) \
--disable-sequence-parallel-moe \
--gpu-memory-utilization 0.9 \
--load-choices "default" > server.log 2>&1 &
python -m pytest -s --timeout=600 tests/ci_use/XPU_45T/run_ep.py
ep_exit_code=$?
sleep 60
# Health check
TIMEOUT=$((15 * 60))
INTERVAL=10
ENDPOINT="http://0.0.0.0:${port_num}/health"
START_TIME=$(date +%s)
echo "开始服务健康检查,最长等待时间:${TIMEOUT}"
while true; do
CURRENT_TIME=$(date +%s)
ELAPSED=$((CURRENT_TIME - START_TIME))
if [ $ELAPSED -ge $TIMEOUT ]; then
echo -e "\n服务启动超时经过 $((TIMEOUT/60)) 分钟服务仍未启动!"
stop_processes
cat server.log
echo "log/workerlog.0"
cat log/workerlog.0
exit 1
fi
HTTP_CODE=$(curl -s -o /dev/null -w "%{http_code}" -m 2 "$ENDPOINT" || true)
echo -e "\r服务健康检查中... 已等待 ${ELAPSED} 秒,当前状态码:${HTTP_CODE}"
if [ "$HTTP_CODE" = "200" ]; then
echo -e "\n服务启动成功耗时 ${ELAPSED}"
break
else
sleep $INTERVAL
fi
done
cat server.log
# Run the online inference verification script
python tests/ci_use/XPU_45T/run_ep_online.py
ep_online_exit_code=$?
echo ep_online_exit_code is ${ep_online_exit_code}
unset BKCL_ENABLE_XDR
unset BKCL_RDMA_NICS
@@ -327,18 +378,101 @@ unset XSHMEM_QP_NUM_PER_RANK
unset BKCL_RDMA_VERBS
stop_processes
if [ ${ep_exit_code} -ne 0 ]; then
echo "log/workerlog.0"
if [ ${ep_online_exit_code} -ne 0 ]; then
cat log/workerlog.0
echo "EP8TP1 相关测试失败请检查pr代码"
echo "EP4TP4 在线服务相关测试失败请检查pr代码"
exit 1
fi
echo "============================开始 EP8TP8 allreduce 测试!============================"
echo "============================开始 EP4TP1 在线服务测试!============================"
sleep 5
rm -rf log/*
rm -f core*
# pkill -9 python  # not executed in the pipeline
ipcrm --all=msg
xpu-smi
if [[ "$GPU_ID" == "0" ]]; then
export XPU_VISIBLE_DEVICES="0,1,2,3"
else
export XPU_VISIBLE_DEVICES="4,5,6,7"
fi
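# BKCL / XSHMEM settings: enable RDMA over the listed NICs and shared-memory transport for cross-card communication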
export BKCL_ENABLE_XDR=1
export BKCL_RDMA_NICS=xgbe1,xgbe2,xgbe3,xgbe4
export BKCL_TRACE_TOPO=1
export BKCL_PCIE_RING=1
export XSHMEM_MODE=1
export XSHMEM_QP_NUM_PER_RANK=32
export BKCL_RDMA_VERBS=1
export port_num=$((8188 + GPU_ID * 100))
# Start the API server
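# With --data-parallel-size 4, one engine worker queue port per DP rank is passed as a comma-separated list (+10/+20/+30/+40); stop_processes cleans exactly this range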
python -m fastdeploy.entrypoints.openai.api_server \
--model ${MODEL_PATH}/ERNIE-4.5-300B-A47B-Paddle \
--port $port_num \
--tensor-parallel-size 1 \
--enable-expert-parallel \
--data-parallel-size 4 \
--max-model-len 32768 \
--max-num-seqs 64 \
--quantization "wint4" \
--engine-worker-queue-port "$((port_num + 10)),$((port_num + 20)),$((port_num + 30)),$((port_num + 40))" \
--metrics-port $((port_num + 2)) \
--cache-queue-port $((port_num + 47873)) \
--gpu-memory-utilization 0.9 \
--load-choices "default" > server.log 2>&1 &
sleep 60
# Health check (same as above)
TIMEOUT=$((15 * 60))
INTERVAL=10
ENDPOINT="http://0.0.0.0:${port_num}/health"
START_TIME=$(date +%s)
while true; do
CURRENT_TIME=$(date +%s)
ELAPSED=$((CURRENT_TIME - START_TIME))
if [ $ELAPSED -ge $TIMEOUT ]; then
echo -e "\n服务启动超时经过 $((TIMEOUT/60)) 分钟服务仍未启动!"
stop_processes
cat server.log
cat log/workerlog.0
exit 1
fi
HTTP_CODE=$(curl -s -o /dev/null -w "%{http_code}" -m 2 "$ENDPOINT" || true)
if [ "$HTTP_CODE" = "200" ]; then
echo -e "\n服务启动成功耗时 ${ELAPSED}"
break
else
sleep $INTERVAL
fi
done
cat server.log
# Run the online inference verification script
python tests/ci_use/XPU_45T/run_ep_online.py
ep_online_exit_code=$?
echo ep_online_exit_code is ${ep_online_exit_code}
unset BKCL_ENABLE_XDR
unset BKCL_RDMA_NICS
unset BKCL_TRACE_TOPO
unset BKCL_PCIE_RING
unset XSHMEM_MODE
unset XSHMEM_QP_NUM_PER_RANK
unset BKCL_RDMA_VERBS
stop_processes
if [ ${ep_online_exit_code} -ne 0 ]; then
cat log/workerlog.0
echo "EP4TP1 在线服务相关测试失败请检查pr代码"
exit 1
fi
echo "============================开始 EP4TP4 all2all 测试!============================"
sleep 5
rm -rf log/*
rm -f core*
# pkill -9 python  # not executed in the pipeline
ipcrm --all=msg
xpu-smi
if [[ "$GPU_ID" == "0" ]]; then
@@ -355,12 +489,57 @@ export XSHMEM_MODE=1
export XSHMEM_QP_NUM_PER_RANK=32
export BKCL_RDMA_VERBS=1
export enable_expert_parallel=1
export enable_tensor_parallel=1
export disable_sequence_parallel_moe=1
export port_num=$((8188 + GPU_ID * 100))
# Start the API server
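# Same EP4TP4 launch as the first section but without --disable-sequence-parallel-moe, exercising the all2all MoE path named in the section title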
python -m fastdeploy.entrypoints.openai.api_server \
--model ${MODEL_PATH}/ERNIE-4.5-300B-A47B-Paddle \
--port $port_num \
--tensor-parallel-size 4 \
--enable-expert-parallel \
--data-parallel-size 1 \
--max-model-len 32768 \
--max-num-seqs 64 \
--quantization "wint4" \
--engine-worker-queue-port $((port_num + 10)) \
--metrics-port $((port_num + 2)) \
--cache-queue-port $((port_num + 47873)) \
--gpu-memory-utilization 0.9 \
--load-choices "default" > server.log 2>&1 &
python -m pytest -s --timeout=600 tests/ci_use/XPU_45T/run_ep.py
ep_exit_code=$?
sleep 60
# Health check
TIMEOUT=$((15 * 60))
INTERVAL=10
ENDPOINT="http://0.0.0.0:${port_num}/health"
START_TIME=$(date +%s)
echo "开始服务健康检查,最长等待时间:${TIMEOUT}"
while true; do
CURRENT_TIME=$(date +%s)
ELAPSED=$((CURRENT_TIME - START_TIME))
if [ $ELAPSED -ge $TIMEOUT ]; then
echo -e "\n服务启动超时经过 $((TIMEOUT/60)) 分钟服务仍未启动!"
stop_processes
cat server.log
echo "log/workerlog.0"
cat log/workerlog.0
exit 1
fi
HTTP_CODE=$(curl -s -o /dev/null -w "%{http_code}" -m 2 "$ENDPOINT" || true)
echo -e "\r服务健康检查中... 已等待 ${ELAPSED} 秒,当前状态码:${HTTP_CODE}"
if [ "$HTTP_CODE" = "200" ]; then
echo -e "\n服务启动成功耗时 ${ELAPSED}"
break
else
sleep $INTERVAL
fi
done
cat server.log
# Run the online inference verification script
python tests/ci_use/XPU_45T/run_ep_online.py
ep_online_exit_code=$?
echo ep_online_exit_code is ${ep_online_exit_code}
unset BKCL_ENABLE_XDR
unset BKCL_RDMA_NICS
@@ -369,58 +548,10 @@ unset BKCL_PCIE_RING
unset XSHMEM_MODE
unset XSHMEM_QP_NUM_PER_RANK
unset BKCL_RDMA_VERBS
unset enable_expert_parallel
unset enable_tensor_parallel
unset disable_sequence_parallel_moe
stop_processes
if [ ${ep_exit_code} -ne 0 ]; then
echo "log/workerlog.0"
if [ ${ep_online_exit_code} -ne 0 ]; then
cat log/workerlog.0
echo "EP8TP8 allreduce 相关测试失败请检查pr代码"
exit 1
fi
echo "============================开始 EP8TP8 all2all 测试!============================"
sleep 5
rm -rf log/*
rm -f core*
ipcrm --all=msg
xpu-smi
if [[ "$GPU_ID" == "0" ]]; then
export XPU_VISIBLE_DEVICES="0,1,2,3"
else
export XPU_VISIBLE_DEVICES="4,5,6,7"
fi
export BKCL_ENABLE_XDR=1
export BKCL_RDMA_NICS=xgbe1,xgbe2,xgbe3,xgbe4
export BKCL_TRACE_TOPO=1
export BKCL_PCIE_RING=1
export XSHMEM_MODE=1
export XSHMEM_QP_NUM_PER_RANK=32
export BKCL_RDMA_VERBS=1
export enable_expert_parallel=1
export enable_tensor_parallel=1
python -m pytest -s --timeout=600 tests/ci_use/XPU_45T/run_ep.py
ep_exit_code=$?
unset BKCL_ENABLE_XDR
unset BKCL_RDMA_NICS
unset BKCL_TRACE_TOPO
unset BKCL_PCIE_RING
unset XSHMEM_MODE
unset XSHMEM_QP_NUM_PER_RANK
unset BKCL_RDMA_VERBS
unset enable_expert_parallel
unset enable_tensor_parallel
stop_processes
if [ ${ep_exit_code} -ne 0 ]; then
echo "log/workerlog.0"
cat log/workerlog.0
echo "EP8TP8 all2all 相关测试失败请检查pr代码"
echo "EP4TP4 all2all 在线服务相关测试失败请检查pr代码"
exit 1
fi


@@ -36,7 +36,7 @@ def test_45t():
    )
    print(response.choices[0].message.content)
    # print(base_response)
    assert "人工智能" in response.choices[0].message.content
    assert any(keyword in response.choices[0].message.content for keyword in ["人工智能", "文心一言"])
if __name__ == "__main__":


@@ -0,0 +1,42 @@
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os

import openai


def test_ep():
    ip = "0.0.0.0"
    # GPU_ID selects which of the two concurrent CI jobs' servers to query
    gpu_id = int(os.getenv("GPU_ID", "0"))
    service_http_port = 8188 + gpu_id * 100  # must match the port the service was started with
    client = openai.Client(base_url=f"http://{ip}:{service_http_port}/v1", api_key="EMPTY_API_KEY")

    # Non-streaming chat completion
    response = client.chat.completions.create(
        model="default",
        messages=[
            {"role": "user", "content": "你好,你是谁?"},
        ],
        temperature=1,
        top_p=0,
        max_tokens=64,
        stream=False,
    )
    print(response.choices[0].message.content)
    # print(base_response)
    # Accept any of the expected keywords in the reply
    assert any(keyword in response.choices[0].message.content for keyword in ["人工智能", "文心一言"])


if __name__ == "__main__":
    test_ep()


@@ -36,7 +36,7 @@ def test_w4a8():
    )
    print(response.choices[0].message.content)
    # print(base_response)
    assert "人工智能" in response.choices[0].message.content
    assert any(keyword in response.choices[0].message.content for keyword in ["人工智能", "文心一言"])
if __name__ == "__main__":