[fix] fix ep group all-reduce (#4140)

* [fix] fix ep group all-reduce

* [fix] fix clear/update lock not working when workers > 1

* [chore] add an info log when preemption is triggered

* [fix] fix code style

* fix model_weights_signal (#4092)

* fix model_weights_signal

---------

Co-authored-by: Yuanle Liu <yuanlehome@163.com>
This commit is contained in:
李泳桦
2025-09-18 10:34:49 +08:00
committed by GitHub
parent cffde70949
commit 0fa28b1068
6 changed files with 41 additions and 26 deletions

View File

@@ -220,23 +220,17 @@ class DynamicWeightManager:
check model weights status
"""
logger.info(f"dynamic weight manager is check model weights status! {model_weights_status.value[0]}")
is_stop = 0
while model_weights_status.value[0] != ModelWeightsStatus.NORMAL:
if model_weights_status.value[0] == ModelWeightsStatus.UPDATING:
logger.info("infer engine stopped! start to load new checkpoint...")
model_runner.update_parameters(pid)
while model_weights_status.value[0] != ModelWeightsStatus.NORMAL:
time.sleep(0.01)
logger.info("finished loading new checkpoint")
elif model_weights_status.value[0] == ModelWeightsStatus.CLEARING:
logger.info("infer engine stopped! start to clear checkpoint...")
model_runner.clear_parameters(pid)
while True:
if model_weights_status.value[0] == ModelWeightsStatus.NORMAL:
logger.info("finished loading new checkpoint")
break
elif is_stop == 1 or (model_weights_status.value[0] == ModelWeightsStatus.CLEARED and is_stop == 0):
if is_stop == 0:
logger.info("finished clearing checkpoint")
is_stop = 1
time.sleep(0.001)
break
else:
time.sleep(0.001)
while model_weights_status.value[0] != ModelWeightsStatus.CLEARED:
time.sleep(0.01)
logger.info("finished clearing checkpoint")
time.sleep(0.01)