[CI] Allow occasional distributed worker exit_code (#5341)

This commit is contained in:
YuBaoku
2025-12-03 10:56:59 +08:00
committed by GitHub
parent 0eb799a324
commit dfeabee123
5 changed files with 5 additions and 5 deletions

View File

@@ -47,7 +47,7 @@ for subdir in "$run_path"*/; do
if [ "$exit_code" -eq 1 ] || [ "$exit_code" -eq 124 ]; then
echo "[ERROR] $file 起服务或执行异常exit_code=$exit_code"
if [ "$exit_code" -eq 124 ]; then
-echo "[TIMEOUT] $file 脚本执行超过 6 分钟, 任务超时退出!"
+echo "[TIMEOUT] $file 脚本执行超过 10 分钟, 任务超时退出!"
fi
fi

View File

@@ -63,4 +63,4 @@ def test_rollout_model_with_distributed_launch():
print("\n" + "=" * 50 + " STDERR " + "=" * 50)
print(stderr)
-assert return_code != 1, f"Process exited with code {return_code}"
+assert return_code in (0, 250), f"Process exited with code {return_code}"

View File

@@ -43,7 +43,7 @@ def test_fused_moe_launch():
stdout, stderr = process.communicate()
return_code = -1
print(f"std_out: {stdout}")
-assert return_code != 1, f"Process exited with code {return_code}, stdout: {stdout}, stderr: {stderr}"
+assert return_code in (0, 250), f"Process exited with code {return_code}, stdout: {stdout}, stderr: {stderr}"
test_fused_moe_launch()

View File

@@ -42,7 +42,7 @@ def test_custom_all_reduce_launch():
process.kill()
stdout, stderr = process.communicate()
return_code = -1
-assert return_code == 0, f"Process exited with code {return_code}"
+assert return_code in (0, 250), f"Process exited with code {return_code}"
test_custom_all_reduce_launch()

View File

@@ -65,4 +65,4 @@ def test_rollout_model_with_distributed_launch():
print("\n" + "=" * 50 + " STDERR " + "=" * 50)
print(stderr)
-assert return_code != 1, f"Process exited with code {return_code}"
+assert return_code in (0, 250), f"Process exited with code {return_code}"