From 9ff99d2b0392bc3d024b89285c2803e25177eddf Mon Sep 17 00:00:00 2001 From: Yonghua Li <39643373+liyonghua0910@users.noreply.github.com> Date: Tue, 23 Dec 2025 17:51:35 +0800 Subject: [PATCH] [BugFix] fix double shutdown of comm group when rank0 clears weights slower than other ranks (#5710) --- fastdeploy/rl/dynamic_weight_manager.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fastdeploy/rl/dynamic_weight_manager.py b/fastdeploy/rl/dynamic_weight_manager.py index bee87de3b..cbee0f990 100644 --- a/fastdeploy/rl/dynamic_weight_manager.py +++ b/fastdeploy/rl/dynamic_weight_manager.py @@ -281,10 +281,14 @@ class DynamicWeightManager: logger.info("infer engine stopped! start to load new checkpoint...") model_runner.clear_requests() model_runner.update_parameters(pid) + while model_weights_status.value[0] != ModelWeightsStatus.NORMAL: + time.sleep(0.01) logger.info("finished loading new checkpoint") elif model_weights_status.value[0] == ModelWeightsStatus.CLEARING: logger.info("infer engine stopped! start to clear checkpoint...") model_runner.clear_requests() model_runner.clear_parameters(pid) + while model_weights_status.value[0] != ModelWeightsStatus.CLEARED: + time.sleep(0.01) logger.info("finished clearing checkpoint") time.sleep(0.01)