mirror of
https://github.com/PaddlePaddle/FastDeploy.git
synced 2025-10-05 16:48:03 +08:00
[fix] fix ep group all-reduce (#4140)
* [fix] fix ep group all-reduce
* [fix] fix clear/update lock not working when workers > 1
* [chore] add preemption triggered info log
* [fix] fix code style
* fix model_weights_signal (#4092)
* fix model_weights_signal

---------

Co-authored-by: Yuanle Liu <yuanlehome@163.com>
This commit is contained in:
@@ -16,12 +16,12 @@
|
||||
|
||||
import inspect
|
||||
import os
|
||||
import threading
|
||||
import time
|
||||
import traceback
|
||||
import uuid
|
||||
|
||||
import numpy as np
|
||||
from filelock import FileLock
|
||||
|
||||
from fastdeploy import envs
|
||||
from fastdeploy.config import ModelConfig
|
||||
@@ -132,7 +132,7 @@ class EngineClient:
|
||||
pid, max_connections=int(os.getenv("FD_DEALER_CONNECTIONS", 50))
|
||||
)
|
||||
self.connection_initialized = False
|
||||
self.clear_update_lock = threading.Lock()
|
||||
self.clear_update_lock = FileLock(f"/tmp/fd_weight_clear_update_lock__pid{pid}_port{port}.lock")
|
||||
|
||||
def create_zmq_client(self, model, mode):
|
||||
"""
|
||||
@@ -351,7 +351,9 @@ class EngineClient:
|
||||
if self.model_weights_status_signal.value[0] == ModelWeightsStatus.NORMAL:
|
||||
return True, ""
|
||||
if self.model_weights_status_signal.value[0] == ModelWeightsStatus.UPDATING:
|
||||
return False, "updating model weight already"
|
||||
return False, "worker is updating model weight already"
|
||||
if self.model_weights_status_signal.value[0] == ModelWeightsStatus.CLEARING:
|
||||
return False, "worker is clearing model weight, cannot update now"
|
||||
|
||||
self.model_weights_status_signal.value[0] = ModelWeightsStatus.UPDATING
|
||||
if self.enable_prefix_caching or self.enable_splitwise:
|
||||
@@ -395,7 +397,9 @@ class EngineClient:
|
||||
if self.model_weights_status_signal.value[0] == ModelWeightsStatus.CLEARED:
|
||||
return True, ""
|
||||
if self.model_weights_status_signal.value[0] == ModelWeightsStatus.CLEARING:
|
||||
return False, "clearing model weight already"
|
||||
return False, "worker is clearing model weight already"
|
||||
if self.model_weights_status_signal.value[0] == ModelWeightsStatus.UPDATING:
|
||||
return False, "worker is updating model weight, cannot clear now"
|
||||
|
||||
self.model_weights_status_signal.value[0] = ModelWeightsStatus.CLEARING
|
||||
if self.enable_prefix_caching or self.enable_splitwise:
|
||||
|
Reference in New Issue
Block a user