Mirror of https://github.com/PaddlePaddle/FastDeploy.git (last synced 2025-12-24 13:28:13 +08:00)
[CI] Allow occasional distributed worker exit_code (#5341)
This commit is contained in:
@@ -47,7 +47,7 @@ for subdir in "$run_path"*/; do
|
|||||||
if [ "$exit_code" -eq 1 ] || [ "$exit_code" -eq 124 ]; then
|
if [ "$exit_code" -eq 1 ] || [ "$exit_code" -eq 124 ]; then
|
||||||
echo "[ERROR] $file 起服务或执行异常,exit_code=$exit_code"
|
echo "[ERROR] $file 起服务或执行异常,exit_code=$exit_code"
|
||||||
if [ "$exit_code" -eq 124 ]; then
|
if [ "$exit_code" -eq 124 ]; then
|
||||||
echo "[TIMEOUT] $file 脚本执行超过 6 分钟, 任务超时退出!"
|
echo "[TIMEOUT] $file 脚本执行超过 10 分钟, 任务超时退出!"
|
||||||
fi
|
fi
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
|||||||
@@ -63,4 +63,4 @@ def test_rollout_model_with_distributed_launch():
|
|||||||
print("\n" + "=" * 50 + " STDERR " + "=" * 50)
|
print("\n" + "=" * 50 + " STDERR " + "=" * 50)
|
||||||
print(stderr)
|
print(stderr)
|
||||||
|
|
||||||
assert return_code != 1, f"Process exited with code {return_code}"
|
assert return_code in (0, 250), f"Process exited with code {return_code}"
|
||||||
|
|||||||
@@ -43,7 +43,7 @@ def test_fused_moe_launch():
|
|||||||
stdout, stderr = process.communicate()
|
stdout, stderr = process.communicate()
|
||||||
return_code = -1
|
return_code = -1
|
||||||
print(f"std_out: {stdout}")
|
print(f"std_out: {stdout}")
|
||||||
assert return_code != 1, f"Process exited with code {return_code}, stdout: {stdout}, stderr: {stderr}"
|
assert return_code in (0, 250), f"Process exited with code {return_code}, stdout: {stdout}, stderr: {stderr}"
|
||||||
|
|
||||||
|
|
||||||
test_fused_moe_launch()
|
test_fused_moe_launch()
|
||||||
|
|||||||
@@ -42,7 +42,7 @@ def test_custom_all_reduce_launch():
|
|||||||
process.kill()
|
process.kill()
|
||||||
stdout, stderr = process.communicate()
|
stdout, stderr = process.communicate()
|
||||||
return_code = -1
|
return_code = -1
|
||||||
assert return_code == 0, f"Process exited with code {return_code}"
|
assert return_code in (0, 250), f"Process exited with code {return_code}"
|
||||||
|
|
||||||
|
|
||||||
test_custom_all_reduce_launch()
|
test_custom_all_reduce_launch()
|
||||||
|
|||||||
@@ -65,4 +65,4 @@ def test_rollout_model_with_distributed_launch():
|
|||||||
print("\n" + "=" * 50 + " STDERR " + "=" * 50)
|
print("\n" + "=" * 50 + " STDERR " + "=" * 50)
|
||||||
print(stderr)
|
print(stderr)
|
||||||
|
|
||||||
assert return_code != 1, f"Process exited with code {return_code}"
|
assert return_code in (0, 250), f"Process exited with code {return_code}"
|
||||||
|
|||||||
Reference in New Issue
Block a user