From dfeabee123d718e2253efbc1ee580fae8709cc37 Mon Sep 17 00:00:00 2001 From: YuBaoku <49938469+EmmonsCurse@users.noreply.github.com> Date: Wed, 3 Dec 2025 10:56:59 +0800 Subject: [PATCH] [CI] Allow occasional distributed worker exit_code (#5341) --- scripts/run_pre_ce.sh | 2 +- tests/ci_use/GLM-45-AIR/test_rollout_model.py | 2 +- tests/distributed/test_chunked_moe.py | 2 +- tests/distributed/test_custom_all_reduce.py | 2 +- tests/e2e/EB_VL_Lite/test_rollout_model.py | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/scripts/run_pre_ce.sh b/scripts/run_pre_ce.sh index e109dbcfa..aac8e404d 100644 --- a/scripts/run_pre_ce.sh +++ b/scripts/run_pre_ce.sh @@ -47,7 +47,7 @@ for subdir in "$run_path"*/; do if [ "$exit_code" -eq 1 ] || [ "$exit_code" -eq 124 ]; then echo "[ERROR] $file 起服务或执行异常,exit_code=$exit_code" if [ "$exit_code" -eq 124 ]; then - echo "[TIMEOUT] $file 脚本执行超过 6 分钟, 任务超时退出!" + echo "[TIMEOUT] $file 脚本执行超过 10 分钟, 任务超时退出!" fi fi diff --git a/tests/ci_use/GLM-45-AIR/test_rollout_model.py b/tests/ci_use/GLM-45-AIR/test_rollout_model.py index a3f279118..71c94f66a 100644 --- a/tests/ci_use/GLM-45-AIR/test_rollout_model.py +++ b/tests/ci_use/GLM-45-AIR/test_rollout_model.py @@ -63,4 +63,4 @@ def test_rollout_model_with_distributed_launch(): print("\n" + "=" * 50 + " STDERR " + "=" * 50) print(stderr) - assert return_code != 1, f"Process exited with code {return_code}" + assert return_code in (0, 250), f"Process exited with code {return_code}" diff --git a/tests/distributed/test_chunked_moe.py b/tests/distributed/test_chunked_moe.py index ab610605b..51d64675e 100644 --- a/tests/distributed/test_chunked_moe.py +++ b/tests/distributed/test_chunked_moe.py @@ -43,7 +43,7 @@ def test_fused_moe_launch(): stdout, stderr = process.communicate() return_code = -1 print(f"std_out: {stdout}") - assert return_code != 1, f"Process exited with code {return_code}, stdout: {stdout}, stderr: {stderr}" + assert return_code in (0, 250), f"Process exited with code {return_code}, stdout: {stdout}, stderr: {stderr}" test_fused_moe_launch() diff --git a/tests/distributed/test_custom_all_reduce.py b/tests/distributed/test_custom_all_reduce.py index f468ee8f7..a0eda233d 100644 --- a/tests/distributed/test_custom_all_reduce.py +++ b/tests/distributed/test_custom_all_reduce.py @@ -42,7 +42,7 @@ def test_custom_all_reduce_launch(): process.kill() stdout, stderr = process.communicate() return_code = -1 - assert return_code == 0, f"Process exited with code {return_code}" + assert return_code in (0, 250), f"Process exited with code {return_code}" test_custom_all_reduce_launch() diff --git a/tests/e2e/EB_VL_Lite/test_rollout_model.py b/tests/e2e/EB_VL_Lite/test_rollout_model.py index 3ce94171b..de9692c4d 100644 --- a/tests/e2e/EB_VL_Lite/test_rollout_model.py +++ b/tests/e2e/EB_VL_Lite/test_rollout_model.py @@ -65,4 +65,4 @@ def test_rollout_model_with_distributed_launch(): print("\n" + "=" * 50 + " STDERR " + "=" * 50) print(stderr) - assert return_code != 1, f"Process exited with code {return_code}" + assert return_code in (0, 250), f"Process exited with code {return_code}"