mirror of
https://github.com/PaddlePaddle/FastDeploy.git
synced 2025-12-24 13:28:13 +08:00
[CI] Allow occasional distributed worker exit_code (#5341)
This commit is contained in:
@@ -47,7 +47,7 @@ for subdir in "$run_path"*/; do
|
||||
if [ "$exit_code" -eq 1 ] || [ "$exit_code" -eq 124 ]; then
|
||||
echo "[ERROR] $file 起服务或执行异常,exit_code=$exit_code"
|
||||
if [ "$exit_code" -eq 124 ]; then
|
||||
echo "[TIMEOUT] $file 脚本执行超过 6 分钟, 任务超时退出!"
|
||||
echo "[TIMEOUT] $file 脚本执行超过 10 分钟, 任务超时退出!"
|
||||
fi
|
||||
fi
|
||||
|
||||
|
||||
@@ -63,4 +63,4 @@ def test_rollout_model_with_distributed_launch():
|
||||
print("\n" + "=" * 50 + " STDERR " + "=" * 50)
|
||||
print(stderr)
|
||||
|
||||
assert return_code != 1, f"Process exited with code {return_code}"
|
||||
assert return_code in (0, 250), f"Process exited with code {return_code}"
|
||||
|
||||
@@ -43,7 +43,7 @@ def test_fused_moe_launch():
|
||||
stdout, stderr = process.communicate()
|
||||
return_code = -1
|
||||
print(f"std_out: {stdout}")
|
||||
assert return_code != 1, f"Process exited with code {return_code}, stdout: {stdout}, stderr: {stderr}"
|
||||
assert return_code in (0, 250), f"Process exited with code {return_code}, stdout: {stdout}, stderr: {stderr}"
|
||||
|
||||
|
||||
test_fused_moe_launch()
|
||||
|
||||
@@ -42,7 +42,7 @@ def test_custom_all_reduce_launch():
|
||||
process.kill()
|
||||
stdout, stderr = process.communicate()
|
||||
return_code = -1
|
||||
assert return_code == 0, f"Process exited with code {return_code}"
|
||||
assert return_code in (0, 250), f"Process exited with code {return_code}"
|
||||
|
||||
|
||||
test_custom_all_reduce_launch()
|
||||
|
||||
@@ -65,4 +65,4 @@ def test_rollout_model_with_distributed_launch():
|
||||
print("\n" + "=" * 50 + " STDERR " + "=" * 50)
|
||||
print(stderr)
|
||||
|
||||
assert return_code != 1, f"Process exited with code {return_code}"
|
||||
assert return_code in (0, 250), f"Process exited with code {return_code}"
|
||||
|
||||
Reference in New Issue
Block a user