mirror of
https://github.com/PaddlePaddle/FastDeploy.git
synced 2025-10-04 16:22:57 +08:00
[fix] fix ep group all-reduce (#4140)
* [fix] fix ep group all-reduce
* [fix] fix clear/update lock not working when workers > 1
* [chore] add preemption triggered info log
* [fix] fix code style
* fix model_weights_signal (#4092)
* fix model_weights_signal

---------

Co-authored-by: Yuanle Liu <yuanlehome@163.com>
This commit is contained in:
@@ -220,23 +220,17 @@ class DynamicWeightManager:
|
||||
check model weights status
|
||||
"""
|
||||
logger.info(f"dynamic weight manager is check model weights status! {model_weights_status.value[0]}")
|
||||
is_stop = 0
|
||||
while model_weights_status.value[0] != ModelWeightsStatus.NORMAL:
|
||||
if model_weights_status.value[0] == ModelWeightsStatus.UPDATING:
|
||||
logger.info("infer engine stopped! start to load new checkpoint...")
|
||||
model_runner.update_parameters(pid)
|
||||
while model_weights_status.value[0] != ModelWeightsStatus.NORMAL:
|
||||
time.sleep(0.01)
|
||||
logger.info("finished loading new checkpoint")
|
||||
elif model_weights_status.value[0] == ModelWeightsStatus.CLEARING:
|
||||
logger.info("infer engine stopped! start to clear checkpoint...")
|
||||
model_runner.clear_parameters(pid)
|
||||
while True:
|
||||
if model_weights_status.value[0] == ModelWeightsStatus.NORMAL:
|
||||
logger.info("finished loading new checkpoint")
|
||||
break
|
||||
elif is_stop == 1 or (model_weights_status.value[0] == ModelWeightsStatus.CLEARED and is_stop == 0):
|
||||
if is_stop == 0:
|
||||
logger.info("finished clearing checkpoint")
|
||||
is_stop = 1
|
||||
time.sleep(0.001)
|
||||
break
|
||||
else:
|
||||
time.sleep(0.001)
|
||||
while model_weights_status.value[0] != ModelWeightsStatus.CLEARED:
|
||||
time.sleep(0.01)
|
||||
logger.info("finished clearing checkpoint")
|
||||
time.sleep(0.01)
|
||||
|
Reference in New Issue
Block a user