[Feature] support eplb in api_server (#4782)

* support eplb in api_server

* update code

* add eplb test case

* update eplb

* support tp+dp eplb

* update test case

* update code

* update code

* fix bug

* update copilot review

* update test case name
This commit is contained in:
kevin
2025-11-24 20:22:29 +08:00
committed by GitHub
parent d5bd64336a
commit 8e4e3ff510
25 changed files with 2102 additions and 421 deletions

View File

@@ -186,7 +186,6 @@ class ModelConfig:
self.enable_logprob = False
self.max_logprobs = 20
self.logprobs_mode = "raw_logprobs"
self.enable_redundant_experts = False
self.redundant_experts_num = 0
self.seed = 0
self.quantization = None
@@ -1153,20 +1152,54 @@ class EPLBConfig:
def __init__(
    self,
    args,
):
    """
    Build the EPLB (Expert Parallelism Load Balancer) configuration.

    Every attribute starts from a hard-coded default; any key in ``args``
    that matches an existing attribute overrides that default. Unknown
    keys are ignored on purpose, because callers pass the full
    ``asdict(EngineArgs)`` dict which contains many unrelated keys.

    Args:
        args: dict of overrides, or None to keep all defaults.
    """
    # Fix: the previous body first read every field from envs.FD_* and then
    # immediately overwrote each one with the defaults below, so the env
    # reads were dead code (and those env vars are being removed).
    if args is None:
        args = {}
    # kept for backward compatibility with callers that still read this
    # flag (it used to be driven by FD_ENABLE_REDUNDANT_EXPERTS)
    self.enable_redundant_experts: bool = False
    # enable eplb
    self.enable_eplb: bool = False
    # redundant experts num
    self.redundant_experts_num: int = 0
    # expert ip shm size
    self.redundant_expert_ip_shm_size: int = 1024
    # expert meta dir
    self.redundant_expert_meta_dir: str = "/tmp/redundant_expert_meta"
    # expert api user and password
    self.redundant_expert_api_user: str = ""
    self.redundant_expert_api_password: str = ""
    # expert eplb strategy
    self.redundant_expert_eplb_strategy: str = ""
    # expert dump workload interval
    self.redundant_expert_dump_workload_interval: int = 10
    # expert async load model shmem size gb
    self.redundant_expert_async_load_model_shmem_size_gb: int = 0
    # expert enable schedule cordon
    self.redundant_expert_enable_schedule_cordon: bool = True
    # model use safetensors
    self.model_use_safetensors: bool = True
    # model use offline quant
    self.model_use_offline_quant: bool = True
    # moe quant type
    self.moe_quant_type: str = "w4a8"
    # apply caller-provided overrides last so they win over defaults
    for key, value in args.items():
        if hasattr(self, key):
            setattr(self, key, value)
def to_json_string(self):
    """
    Serialize this configuration to a JSON string.

    Attributes whose value is None are omitted from the output.
    """
    populated = {}
    for name, val in self.__dict__.items():
        if val is not None:
            populated[name] = val
    return json.dumps(populated)
def print(self):
    """
    Log every attribute of this configuration, one per line, followed
    by a separator rule.
    """
    logger.info("EPLB Configuration Information :")
    for name, val in self.__dict__.items():
        logger.info(f"{name:<20}:{'':<6}{val}")
    logger.info("=============================================================")
class CacheConfig:

View File

@@ -467,6 +467,16 @@ class EngineArgs:
Url for router server, such as `0.0.0.0:30000`.
"""
enable_eplb: bool = False
"""
Flag to enable eplb
"""
eplb_config: Optional[Dict[str, Any]] = None
"""
Configuration for eplb.
"""
def __post_init__(self):
"""
Post-initialization processing to set default tokenizer if not provided.
@@ -850,6 +860,18 @@ class EngineArgs:
default=EngineArgs.enable_expert_parallel,
help="Enable expert parallelism.",
)
parallel_group.add_argument(
"--enable-eplb",
action="store_true",
default=EngineArgs.enable_eplb,
help="Enable eplb.",
)
parallel_group.add_argument(
"--eplb-config",
type=json.loads,
default=EngineArgs.eplb_config,
help="Config of eplb.",
)
# Load group
load_group = parser.add_argument_group("Load Configuration")
@@ -1126,7 +1148,7 @@ class EngineArgs:
def create_scheduler_config(self) -> SchedulerConfig:
"""
Create and retuan a SchedulerConfig object based on the current settings.
Create and return a SchedulerConfig object based on the current settings.
"""
prefix = "scheduler_"
prefix_len = len(prefix)
@@ -1173,13 +1195,22 @@ class EngineArgs:
early_stop_args[k] = v
return EarlyStopConfig(early_stop_args)
def create_eplb_config(self) -> EPLBConfig:
    """
    Create and return an EPLBConfig object based on the current settings.
    """
    # start from the full EngineArgs dict so EPLBConfig can pick up any
    # attribute whose name happens to match, then layer the explicit
    # --eplb-config JSON on top of it
    eplb_args = asdict(self)
    if self.eplb_config is not None:
        for k, v in self.eplb_config.items():
            eplb_args[k] = v
    # the --enable-eplb flag always wins over any "enable_eplb" key
    # carried inside --eplb-config
    eplb_args["enable_eplb"] = self.enable_eplb
    return EPLBConfig(eplb_args)
def create_engine_config(self, port_availability_check=True) -> FDConfig:
"""
Create and return a Config object based on the current settings.
"""
all_dict = asdict(self)
eplb_cfg = EPLBConfig()
all_dict["enable_redundant_experts"] = eplb_cfg.enable_redundant_experts
model_cfg = ModelConfig(all_dict)
# XPU currently disable prefix cache for VL model
@@ -1221,6 +1252,7 @@ class EngineArgs:
scheduler_cfg = self.create_scheduler_config()
graph_opt_cfg = self.create_graph_optimization_config()
plas_attention_config = self.create_plas_attention_config()
eplb_cfg = self.create_eplb_config()
router_config = RouterConfig(all_dict)
early_stop_cfg = self.create_early_stop_config()

View File

@@ -833,6 +833,7 @@ class AsyncLLMEngine:
f" --override-pooler-config {self.cfg.model_config.override_pooler_config}"
f" --logprobs_mode {self.cfg.model_config.logprobs_mode}"
f" --max_logprobs {self.cfg.model_config.max_logprobs}"
f" --eplb_config '{self.cfg.eplb_config.to_json_string()}'"
)
worker_store_true_flag = {

View File

@@ -34,6 +34,7 @@ from opentelemetry import trace
from fastdeploy.engine.request import Request, RequestOutput, RequestType
from fastdeploy.engine.resource_manager import ResourceManager
from fastdeploy.engine.sched.resource_manager_v1 import ResourceManagerV1
from fastdeploy.eplb.utils import init_eplb_signals
from fastdeploy.input.preprocess import InputPreprocessor
from fastdeploy.inter_communicator import (
EngineCacheQueue,
@@ -142,6 +143,12 @@ class EngineService:
)
self._init_worker_monitor_signals()
if self.cfg.eplb_config.enable_eplb:
current_suffix = int(
self.cfg.parallel_config.engine_worker_queue_port[self.cfg.parallel_config.local_data_parallel_id]
)
init_eplb_signals(cfg, current_suffix)
self._finalizer = weakref.finalize(self, self._exit_sub_services)
def start(self):

View File

@@ -566,6 +566,7 @@ class LLMEngine:
f" --override-pooler-config {self.cfg.model_config.override_pooler_config}"
f" --logprobs_mode {self.cfg.model_config.logprobs_mode}"
f" --max_logprobs {self.cfg.model_config.max_logprobs}"
f" --eplb_config '{self.cfg.eplb_config.to_json_string()}'"
)
if self.cfg.structured_outputs_config.logits_processors is not None:
arguments += f" --logits-processors {' '.join(self.cfg.structured_outputs_config.logits_processors)}"

View File

@@ -20,20 +20,22 @@ import time
import traceback
import uuid
from copy import copy
from http import HTTPStatus
import numpy as np
from filelock import FileLock
from fastdeploy import envs
from fastdeploy.config import ModelConfig
from fastdeploy.entrypoints.openai.utils import DealerConnectionManager
from fastdeploy.envs import FD_SUPPORT_MAX_CONNECTIONS
from fastdeploy.eplb.utils import RedundantExpertWorkload
from fastdeploy.input.preprocess import InputPreprocessor
from fastdeploy.inter_communicator import (
IPCSignal,
KVCacheStatus,
ModelWeightsStatus,
PrefixTreeStatus,
RearrangeExpertStatus,
ZmqIpcClient,
)
from fastdeploy.metrics.work_metrics import work_process_metrics
@@ -63,6 +65,7 @@ class EngineClient:
port,
limit_mm_per_prompt,
mm_processor_kwargs,
config,
reasoning_parser=None,
data_parallel_size=1,
enable_logprob=False,
@@ -72,11 +75,12 @@ class EngineClient:
splitwise_role=None,
max_processor_cache=0,
):
model_config = ModelConfig({"model": model_name_or_path})
self.enable_mm = model_config.enable_mm
self.config = config
self.model_config = config.model_config
self.enable_mm = self.model_config.enable_mm
enable_processor_cache = self.enable_mm and max_processor_cache > 0
input_processor = InputPreprocessor(
model_config,
self.model_config,
reasoning_parser,
limit_mm_per_prompt,
mm_processor_kwargs,
@@ -96,13 +100,16 @@ class EngineClient:
is_mm_model_disable_prefix_cache,
)
self.disable_prefix_mm = is_mm_model_disable_prefix_cache(model_config)
self.disable_prefix_mm = is_mm_model_disable_prefix_cache(self.model_config)
if tensor_parallel_size <= max_chips_per_node:
self.is_master = True
else:
self.is_master = False
if self.config.eplb_config.enable_eplb:
self.init_eplb_signals(ipc_signal_suffix=port)
array_size = min(max_chips_per_node, tensor_parallel_size)
self.worker_healthy_live_recorded_time_array = np.zeros(shape=[array_size], dtype=np.int32)
self.worker_healthy_live_signal = IPCSignal(
@@ -143,6 +150,113 @@ class EngineClient:
self.connection_initialized = False
self.clear_update_lock = FileLock(f"/tmp/fd_weight_clear_update_lock__pid{pid}_port{port}.lock")
def init_eplb_signals(self, ipc_signal_suffix):
    """
    Initialize eplb signals.

    Attaches (create=False) to the IPC signal segments used to coordinate
    expert rearrangement between the API server and the workers. Signals
    scoped per data-parallel group use a "_dp{id}" suffix; per-tensor-parallel
    signals additionally carry a "_tp{rank}" suffix.

    Args:
        ipc_signal_suffix: base suffix (the serving port) that namespaces
            every IPC signal of this deployment.
    """
    if self.config.parallel_config.tensor_parallel_rank != 0:
        # only TP rank 0 need to init eplb signals, rank 0 manage all EPLB signals for all TP ranks
        return
    # per-TP-rank handles, filled by the loop below (index == tp rank)
    self.signal_clear_experts_token_stats_list = []
    self.local_experts_token_stats_array_list = []
    self.expert_tokens_stats_array_list = []
    self.signal_update_weight_from_disk_array_list = []
    self.update_weight_from_disk_result_list = []
    dp_ipc_signal_suffix = f"{ipc_signal_suffix}_dp{self.config.parallel_config.local_data_parallel_id}"
    # current state of the rearrange workflow (read as RearrangeExpertStatus
    # in rearrange_experts/check_redundant)
    rearrange_experts_status = np.zeros([1], dtype=np.int32)
    self.rearrange_experts_signal = IPCSignal(
        name="rearrange_experts_status",
        array=rearrange_experts_status,
        dtype=np.int32,
        suffix=dp_ipc_signal_suffix,
        create=False,
    )
    # byte length of the ip list currently stored in the shared-memory
    # block attached just below
    rearrange_experts_ips_size_array = np.zeros([1], dtype=np.int32)
    self.rearrange_experts_ips_size_signal = IPCSignal(
        name="rearrange_experts_ips_size",
        array=rearrange_experts_ips_size_array,
        dtype=np.int32,
        suffix=dp_ipc_signal_suffix,
        create=False,
    )
    # raw shared memory holding the ";"-joined "ip:port" list written by
    # rearrange_experts(action="")
    self.shm_rearrange_experts_ips_list = IPCSignal(
        name="rearrange_experts_ips_list",
        shm_size=self.config.eplb_config.redundant_expert_ip_shm_size,
        suffix=dp_ipc_signal_suffix,
        create=False,
    )
    # flag set to 1 by rearrange_experts(action="update_weight_from_tensor")
    signal_update_weight_from_tensor = np.zeros([1], dtype=np.int32)
    self.signal_update_weight_from_tensor_array = IPCSignal(
        name="signal_update_weight_from_tensor",
        array=signal_update_weight_from_tensor,
        dtype=np.int32,
        suffix=dp_ipc_signal_suffix,
        create=False,
    )
    for tp_rank_id in range(self.config.parallel_config.tensor_parallel_size):
        tp_ipc_signal_suffix = f"{dp_ipc_signal_suffix}_tp{tp_rank_id}"
        # flag set to 1 by get_per_expert_tokens_stats(clear_stat=True)
        signal_clear_experts_token_stats = np.zeros([1], dtype=np.int32)
        self.signal_clear_experts_token_stats_list.append(
            IPCSignal(
                name="signal_clear_experts_token_stats",
                array=signal_clear_experts_token_stats,
                dtype=np.int32,
                suffix=tp_ipc_signal_suffix,
                create=False,
            )
        )
        # flag set to 1 by rearrange_experts(action="recv_expert_weight")
        signal_update_weight_from_disk = np.zeros([1], dtype=np.int32)
        self.signal_update_weight_from_disk_array_list.append(
            IPCSignal(
                name="signal_update_weight_from_disk",
                array=signal_update_weight_from_disk,
                dtype=np.int32,
                suffix=tp_ipc_signal_suffix,
                create=False,
            )
        )
        # per-rank disk-load result, polled by check_redundant(action="check_load_weight_result")
        result_update_weight_from_disk = np.zeros([1], dtype=np.int32)
        self.update_weight_from_disk_result_list.append(
            IPCSignal(
                name="result_update_weight_from_disk",
                array=result_update_weight_from_disk,
                dtype=np.int32,
                suffix=tp_ipc_signal_suffix,
                create=False,
            )
        )
        # (layers, experts) token counters; the same template array is passed
        # to both signals below — presumably IPCSignal only uses it for
        # shape/dtype when create=False (TODO confirm against IPCSignal)
        experts_token_stats = np.zeros(
            (self.config.model_config.num_hidden_layers, self.config.model_config.moe_num_experts),
            dtype=np.int32,
        )
        self.expert_tokens_stats_array_list.append(
            IPCSignal(
                name="all_experts_token_stats",
                array=experts_token_stats,
                dtype=np.int32,
                suffix=tp_ipc_signal_suffix,
                create=False,
            )
        )
        self.local_experts_token_stats_array_list.append(
            IPCSignal(
                name="local_experts_token_stats",
                array=experts_token_stats,
                dtype=np.int32,
                suffix=tp_ipc_signal_suffix,
                create=False,
            )
        )
def create_zmq_client(self, model, mode):
"""
Create a ZMQ client.
@@ -470,3 +584,199 @@ class EngineClient:
def check_model_weight_status(self):
return self.model_weights_status_signal.value[0] < 0
async def rearrange_experts(self, request_dict: dict):
    """
    Handle an expert-rearrangement control request.

    Args:
        request_dict (dict): request body; must carry "user"/"passwd"
            credentials and an optional "action" selector
            ("" = start rearrange, "recv_expert_weight",
            "update_weight_from_tensor").

    Returns:
        tuple: response body (dict), HTTP status code
    """
    # Fix: content/status_code must be pre-initialized. Several branches
    # below test `content is None` before any assignment, which previously
    # raised UnboundLocalError on those paths (check_redundant already
    # initializes them this way).
    content, status_code = None, HTTPStatus.OK
    eplb_config = self.config.eplb_config
    if not eplb_config.enable_eplb:
        content = {"code": 1, "msg": "redundant expert is disabled"}
        status_code = HTTPStatus.BAD_REQUEST
        return content, status_code
    if (
        request_dict.get("user", "") != eplb_config.redundant_expert_api_user
        or request_dict.get("passwd", "") != eplb_config.redundant_expert_api_password
    ):
        content = {"code": 1, "msg": "user or passwd is invalid"}
        status_code = HTTPStatus.UNAUTHORIZED
        return content, status_code
    # only TP rank 0 owns the EPLB signals (see init_eplb_signals)
    if self.config.parallel_config.tensor_parallel_rank != 0:
        content = {
            "code": 1,
            "msg": f"actual rank {self.config.parallel_config.tensor_parallel_rank}, expect rank 0",
        }
        status_code = HTTPStatus.BAD_REQUEST
        return content, status_code
    action = request_dict.get("action", "")
    api_server_logger.info(f"redundant_expert: rearrange_experts recv request, action {action}")
    if action == "":
        # action: start rearrange experts
        # params: {'user': 'xxx', 'passwd': 'xxx', 'ips': ['10.54.99.77:8000', '10.54.99.77:8300']}
        if self.rearrange_experts_signal.value[0] != RearrangeExpertStatus.FREE.value:
            content = {
                "code": 1,
                "msg": f"rearrange is doing. actual status {self.rearrange_experts_signal.value[0]}, expect status {RearrangeExpertStatus.FREE.value}",
            }
            status_code = HTTPStatus.BAD_REQUEST
        if "ips" not in request_dict and content is None:
            content = {"code": 1, "msg": "ips in request is None"}
            status_code = HTTPStatus.BAD_REQUEST
        if content is not None:
            return content, status_code
        # publish the participant list: size signal first, then the bytes
        # in the shared-memory block
        data_bytes = (";".join(request_dict["ips"])).encode("utf-8")
        data_size = len(data_bytes)
        if data_size > eplb_config.redundant_expert_ip_shm_size:
            content = {
                "code": 1,
                "msg": f"actual ips size {data_size}, max limit {eplb_config.redundant_expert_ip_shm_size}",
            }
            status_code = HTTPStatus.INTERNAL_SERVER_ERROR
        else:
            self.rearrange_experts_ips_size_signal.value[0] = data_size
            self.shm_rearrange_experts_ips_list.shm.buf[:data_size] = data_bytes
            content = {"code": 0, "msg": "ok"}
            status_code = HTTPStatus.OK
        return content, status_code
    elif action == "recv_expert_weight":
        # action: receive global expert workload, and begin update weight from disk
        # params: {'user': 'xxx', 'passwd': 'xxx', 'weight': (layers, experts)}
        if "data" not in request_dict or not isinstance(request_dict["data"], list):
            content = {"code": 1, "msg": "data not in request or data is not a list"}
            status_code = HTTPStatus.BAD_REQUEST
        else:
            # broadcast the workload matrix to every TP rank and kick off
            # the disk reload on each
            weight = np.array(request_dict["data"], dtype=np.int32)
            for idx in range(len(self.expert_tokens_stats_array_list)):
                self.expert_tokens_stats_array_list[idx].value[:] = weight[:]
                self.signal_update_weight_from_disk_array_list[idx].value[0] = 1
            content = {"code": 0, "msg": "ok"}
            status_code = HTTPStatus.OK
        return content, status_code
    elif action == "update_weight_from_tensor":
        # only valid on prefill instances, and only once the async load finished
        if self.config.scheduler_config.splitwise_role != "prefill" and content is None:
            content = {
                "code": 1,
                "msg": f"actual role {self.config.scheduler_config.splitwise_role}, expect role prefill",
            }
            status_code = HTTPStatus.BAD_REQUEST
        if self.rearrange_experts_signal.value[0] != RearrangeExpertStatus.LOAD_SUCC.value and content is None:
            content = {
                "code": 1,
                "msg": f"actual status {self.rearrange_experts_signal.value[0]}, expect status {RearrangeExpertStatus.LOAD_SUCC.value}",
            }
            status_code = HTTPStatus.BAD_REQUEST
        if content is None:
            self.signal_update_weight_from_tensor_array.value[0] = 1
            content = {"code": 0, "msg": "ok"}
            status_code = HTTPStatus.OK
        return content, status_code
    else:
        content = {"code": 1, "msg": f"invalid action {action}"}
        status_code = HTTPStatus.BAD_REQUEST
        return content, status_code
async def get_per_expert_tokens_stats(self, request_dict: dict):
    """
    Report the per-expert token statistics collected locally on this node.

    Args:
        request_dict (dict): request body; needs "user"/"passwd", may
            carry "clear_stat" to reset the counters on every TP rank.

    Returns:
        tuple: response body (dict), HTTP status code
    """
    eplb_config = self.config.eplb_config
    if not eplb_config.enable_eplb:
        return {"code": 1, "msg": "redundant expert is disabled"}, HTTPStatus.BAD_REQUEST
    creds_ok = (
        request_dict.get("user", "") == eplb_config.redundant_expert_api_user
        and request_dict.get("passwd", "") == eplb_config.redundant_expert_api_password
    )
    if not creds_ok:
        return {"code": 1, "msg": "user or passwd is invalid"}, HTTPStatus.UNAUTHORIZED
    # only TP rank 0 owns the EPLB signal handles
    if self.config.parallel_config.tensor_parallel_rank != 0:
        return (
            {
                "code": 1,
                "msg": f"actual rank {self.config.parallel_config.tensor_parallel_rank}, expect rank 0",
            },
            HTTPStatus.BAD_REQUEST,
        )
    # optionally ask every TP rank to reset its accumulated counters
    if request_dict.get("clear_stat"):
        for clear_signal in self.signal_clear_experts_token_stats_list:
            clear_signal.value[0] = 1
    stats = [sig.value.tolist() for sig in self.local_experts_token_stats_array_list]
    return {"code": 0, "msg": "ok", "data": stats}, HTTPStatus.OK
async def check_redundant(self, request_dict: dict):
    """
    Query the state of the redundant-expert (EPLB) workflow.

    Args:
        request_dict (dict): request body; needs "user"/"passwd" and an
            optional "action" ("" = report status,
            "check_load_weight_result" = report per-rank disk-load results).

    Returns:
        tuple: response body (dict), HTTP status code
    """
    content, status_code = None, HTTPStatus.OK
    eplb_config = self.config.eplb_config
    if not eplb_config.enable_eplb:
        content = {"code": 1, "msg": "redundant expert is disabled"}
        status_code = HTTPStatus.BAD_REQUEST
        return content, status_code
    if (
        request_dict.get("user", "") != eplb_config.redundant_expert_api_user
        or request_dict.get("passwd", "") != eplb_config.redundant_expert_api_password
    ):
        content = {"code": 1, "msg": "user or passwd is invalid"}
        status_code = HTTPStatus.UNAUTHORIZED
        return content, status_code
    # only TP rank 0 owns the EPLB signals (see init_eplb_signals)
    if self.config.parallel_config.tensor_parallel_rank != 0:
        content = {
            "code": 1,
            "msg": f"actual rank {self.config.parallel_config.tensor_parallel_rank}, expect rank 0",
        }
        status_code = HTTPStatus.BAD_REQUEST
        return content, status_code
    action = request_dict.get("action", "")
    if action == "":
        status = "unknown"
        try:
            status = RearrangeExpertStatus(self.rearrange_experts_signal.value[0]).name
        except Exception:
            # Ignore errors if status cannot be determined; default to "unknown"
            pass
        content = {"code": 0, "msg": "ok", "status": status}
        get_workloads = False if "check_get_workloads" not in request_dict else request_dict["check_get_workloads"]
        if get_workloads:
            content["data"], content["msg"] = RedundantExpertWorkload(eplb_config.redundant_expert_meta_dir).load()
        status_code = HTTPStatus.OK
    elif action == "check_load_weight_result":
        # per-TP-rank results of the last update-weight-from-disk request
        update_weight_from_disk_list = []
        for update_weight_result in self.update_weight_from_disk_result_list:
            update_weight_from_disk_list.append(update_weight_result.value[0].tolist())
        content = {"code": 0, "msg": "ok", "data": update_weight_from_disk_list}
        status_code = HTTPStatus.OK
    else:
        # Fix: an unknown action previously fell through and returned a
        # None body with HTTP 200; reject it explicitly, mirroring
        # rearrange_experts.
        content = {"code": 1, "msg": f"invalid action {action}"}
        status_code = HTTPStatus.BAD_REQUEST
    return content, status_code

View File

@@ -179,6 +179,8 @@ async def lifespan(app: FastAPI):
verification = False
model_paths = [ModelPath(name=served_model_names, model_path=args.model, verification=verification)]
engine_args = EngineArgs.from_cli_args(args)
config = engine_args.create_engine_config(port_availability_check=False)
engine_client = EngineClient(
model_name_or_path=args.model,
tokenizer=args.tokenizer,
@@ -196,6 +198,7 @@ async def lifespan(app: FastAPI):
enable_prefix_caching=args.enable_prefix_caching,
splitwise_role=args.splitwise_role,
max_processor_cache=args.max_processor_cache,
config=config,
)
await engine_client.connection_manager.initialize()
app.state.dynamic_load_weight = args.dynamic_load_weight
@@ -223,8 +226,6 @@ async def lifespan(app: FastAPI):
args.max_waiting_time,
)
engine_args = EngineArgs.from_cli_args(args)
config = engine_args.create_engine_config(port_availability_check=False)
embedding_handler = OpenAIServingEmbedding(
engine_client,
app.state.model_handler,
@@ -515,6 +516,36 @@ def clear_load_weight(request: Request) -> Response:
return Response(content="Dynamic Load Weight Disabled.", status_code=404)
@app.post("/rearrange_experts")
async def rearrange_experts(request: Request):
"""
rearrange experts
"""
request_dict = await request.json()
content, status_code = await app.state.engine_client.rearrange_experts(request_dict=request_dict)
return JSONResponse(content, status_code=status_code)
@app.post("/get_per_expert_tokens_stats")
async def get_per_expert_tokens_stats(request: Request):
"""
get per expert tokens stats
"""
request_dict = await request.json()
content, status_code = await app.state.engine_client.get_per_expert_tokens_stats(request_dict=request_dict)
return JSONResponse(content, status_code=status_code)
@app.post("/check_redundant")
async def check_redundant(request: Request):
"""
check redundant
"""
request_dict = await request.json()
content, status_code = await app.state.engine_client.check_redundant(request_dict=request_dict)
return JSONResponse(content, status_code=status_code)
def launch_api_server() -> None:
"""
启动http服务

View File

@@ -351,6 +351,8 @@ def create_model_paths(args: Namespace) -> List[ModelPath]:
async def initialize_engine_client(args: Namespace, pid: int) -> EngineClient:
"""Initialize and configure the engine client."""
engine_args = EngineArgs.from_cli_args(args)
config = engine_args.create_engine_config(port_availability_check=False)
engine_client = EngineClient(
model_name_or_path=args.model,
tokenizer=args.tokenizer,
@@ -365,6 +367,7 @@ async def initialize_engine_client(args: Namespace, pid: int) -> EngineClient:
enable_logprob=args.enable_logprob,
workers=args.workers,
tool_parser=args.tool_call_parser,
config=config,
)
await engine_client.connection_manager.initialize()

View File

@@ -136,27 +136,6 @@ environment_variables: dict[str, Callable[[], Any]] = {
"FD_CACHE_PROC_ERROR_COUNT": lambda: int(os.getenv("FD_CACHE_PROC_ERROR_COUNT", "10")),
# API_KEY required for service authentication
"FD_API_KEY": lambda: [] if "FD_API_KEY" not in os.environ else os.environ["FD_API_KEY"].split(","),
# EPLB related
"FD_ENABLE_REDUNDANT_EXPERTS": lambda: int(os.getenv("FD_ENABLE_REDUNDANT_EXPERTS", "0")) == 1,
"FD_REDUNDANT_EXPERTS_NUM": lambda: int(os.getenv("FD_REDUNDANT_EXPERTS_NUM", "0")),
"FD_REDUNDANT_EXPERT_IP_SHM_SIZE": lambda: int(os.getenv("FD_REDUNDANT_EXPERT_IP_SHM_SIZE", "1024")),
"FD_REDUNDANT_EXPERT_META_DIR": lambda: os.getenv("FD_REDUNDANT_EXPERT_META_DIR", "/tmp/redundant_expert_meta"),
"FD_REDUNDANT_EXPERT_API_USER": lambda: os.getenv("FD_REDUNDANT_EXPERT_API_USER", ""),
"FD_REDUNDANT_EXPERT_API_PASSWORD": lambda: os.getenv("FD_REDUNDANT_EXPERT_API_PASSWORD", ""),
"FD_REDUNDANT_EXPERT_EPLB_STRATEGY": lambda: os.getenv("FD_REDUNDANT_EXPERT_EPLB_STRATEGY", ""),
"FD_REDUNDANT_EXPERT_DUMP_WORKLOAD_INTERVAL": lambda: int(
os.getenv("FD_REDUNDANT_EXPERT_DUMP_WORKLOAD_INTERVAL", "10")
),
"FD_REDUNDANT_EXPERT_ASYNC_LOAD_MODEL_SHMEM_SIZE_GB": lambda: int(
os.getenv("FD_REDUNDANT_EXPERT_ASYNC_LOAD_MODEL_SHMEM_SIZE_GB", "0")
),
"FD_REDUNDANT_EXPERT_ENABLE_SCHEDULE_CORDON": lambda: int(
os.getenv("FD_REDUNDANT_EXPERT_ENABLE_SCHEDULE_CORDON", "1")
)
== 1,
"FD_MODEL_USE_SAFETENSORS": lambda: int(os.getenv("FD_MODEL_USE_SAFETENSORS", "1")) == 1,
"FD_MODEL_USE_OFFLINE_QUANT": lambda: int(os.getenv("FD_MODEL_USE_OFFLINE_QUANT", "1")) == 1,
"FD_MOE_QUANT_TYPE": lambda: os.getenv("FD_MOE_QUANT_TYPE", "w4a8"),
# The AK of bos storing the features while multi_modal infer
"ENCODE_FEATURE_BOS_AK": lambda: os.getenv("ENCODE_FEATURE_BOS_AK"),
# The SK of bos storing the features while multi_modal infer

View File

@@ -1,3 +1,15 @@
""" "
Expert Parallelism Load Balancer (EPLB)
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""

View File

@@ -1,4 +1,18 @@
"""AsyncExpertLoader async load the model weights of the MoE experts."""
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
import ctypes
import os
@@ -8,8 +22,9 @@ from typing import List, Tuple
import numpy as np
import paddle
from cuda import cudart
from fastdeploy import envs
from fastdeploy.config import EPLBConfig
REARRANGE_EXPERT_MAGIC_NUM = 147183647
REARRANGE_ORIGINATOR_EP_RANK = 0
@@ -17,7 +32,6 @@ CHECK_TIME_INTERNAL = 3
HTTP_RETRY_NUM = 5
CHECK_TIMEOUT = 120
libc = ctypes.CDLL(None)
libc.mmap.argtypes = [
@@ -45,22 +59,19 @@ MAIN_MODEL_REDUNDANT_SHM_SIZE = 5
MODEL_MAIN_NAME = "eplb_main"
def create_mmap(model_name: List, ep_rank: int, ep_size: int, shm_uuid: str, logger=None):
def create_mmap(model_name: List, ep_rank: int, ep_size: int, shm_uuid: str, eplb_config: EPLBConfig, logger=None):
"""create_mmap"""
flags = MAP_SHARED
prot = PROT_READ | PROT_WRITE
main_size = 0
if envs.FD_REDUNDANT_EXPERT_ASYNC_LOAD_MODEL_SHMEM_SIZE_GB == 0:
if eplb_config.redundant_expert_async_load_model_shmem_size_gb == 0:
main_size = TOTAL_MODEL_SIZE // ep_size
else:
main_size = envs.FD_REDUNDANT_EXPERT_ASYNC_LOAD_MODEL_SHMEM_SIZE_GB
main_size = eplb_config.redundant_expert_async_load_model_shmem_size_gb
main_size = main_size * G
mmap_infos = {}
from cuda import cudart
for name in model_name:
expert_weight_file = f"/dev/shm/{name}_rank_{ep_rank}_expert_weight_{shm_uuid}"
shm_size = main_size
@@ -70,10 +81,7 @@ def create_mmap(model_name: List, ep_rank: int, ep_size: int, shm_uuid: str, log
shm_fd = os.open(expert_weight_file, os.O_RDWR)
os.ftruncate(shm_fd, shm_size)
if logger is not None:
logger.info(
f"redundant_expert: create_mmap file {expert_weight_file}, \
fd {shm_fd}, size {shm_size}"
)
logger.info(f"redundant_expert: create_mmap file {expert_weight_file}, fd {shm_fd}, size {shm_size}")
shm_ptr = libc.mmap(0, ctypes.c_size_t(shm_size), prot, flags, shm_fd, 0)
if shm_ptr == MAP_FAILED:
@@ -86,8 +94,8 @@ def create_mmap(model_name: List, ep_rank: int, ep_size: int, shm_uuid: str, log
(ret,) = cudart.cudaHostRegister(addr, shm_size, 0)
if ret != cudart.cudaError_t.cudaSuccess:
raise RuntimeError(
f"cudaHostRegister failed: {cudart.cudaGetErrorString(ret)},"
+ f" address {hex(addr)} size {shm_size}, ret: {ret}"
f"cudaHostRegister failed: {cudart.cudaGetErrorString(ret)}, "
f" address {hex(addr)} size {shm_size}, ret: {ret}"
)
mmap_infos[name] = shm_ptr
@@ -173,6 +181,7 @@ class AsyncEPLoader(object):
def __init__(
self,
model_dir,
eplb_config,
rank=8,
expert_per_rank=8,
moe_layer_start_index=3,
@@ -183,6 +192,7 @@ class AsyncEPLoader(object):
__init__
"""
self.model_path = model_dir
self.eplb_config = eplb_config
self.expert_per_rank = expert_per_rank
self.moe_layer_start_index = moe_layer_start_index
@@ -239,7 +249,7 @@ class AsyncEPLoader(object):
succ = True
message = ""
if len(need_to_reload) > 0:
if envs.FD_MODEL_USE_SAFETENSORS:
if self.eplb_config.model_use_safetensors:
succ, message = self.load_safetensor_fp8_from_disk(need_to_reload)
else:
succ, message = self.load_weight_bf16_from_disk(need_to_reload)
@@ -278,7 +288,7 @@ class AsyncEPLoader(object):
# self.logger.info(f"redundant_expert: {file_name} not exist.")
continue
# self.logger.info(f"redundant_expert: Loading expert weights: {file_name}.")
self.state_dicts[file_name] = paddle.load(self.model_path + "/merged_tp1_state_split/" + file_name)
# self.state_dicts[file_name] = paddle.load(self.model_path + "/merged_tp1_state_split/" + file_name)
paddle.set_device(last_device)
self.logger.info("redundant_expert: Loading expert weights end.")
@@ -343,7 +353,15 @@ def load_ep_checkpoint(model_path):
def load_model_weights_process(
rank: int, expert_per_rank: int, moe_layer_start_index: int, moe_quant_type: str, data_conn, mg_conn, shm_uuid
rank: int,
model_dir: str,
expert_per_rank: int,
moe_layer_start_index: int,
moe_quant_type: str,
shm_uuid: str,
eplb_config: EPLBConfig,
data_conn,
mg_conn,
):
"""
load_model_weights_process
@@ -354,18 +372,20 @@ def load_model_weights_process(
setproctitle(f"eplb::async_load_model_{rank}")
faulthandler.enable()
from server.utils import get_logger
from fastdeploy.utils import get_logger
logger = get_logger("eplb_async_loader", "eplb_{0}.log".format(rank))
logger.info("redundant_expert: load_model_weights_process start")
paddle.set_device("cpu")
ep_loader = AsyncEPLoader(
model_dir=model_dir,
rank=rank,
expert_per_rank=expert_per_rank,
moe_layer_start_index=moe_layer_start_index,
moe_quant_type=moe_quant_type,
logger=logger,
eplb_config=eplb_config,
)
while True:

View File

@@ -1,4 +1,18 @@
"""Expert Parallelism Load Balancer (EPLB)"""
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
from typing import Tuple
@@ -9,11 +23,9 @@ def balanced_packing(weight: np.ndarray, num_packs: int) -> Tuple[np.ndarray, np
"""
Pack n weighted objects to m packs, such that each bin contains exactly n/m objects and the weights of all packs
are as balanced as possible.
Parameters:
weight: [X, n], the weight of each item
num_packs: number of packs
Returns:
pack_index: [X, n], the pack index of each item
rank_in_pack: [X, n], the rank of the item in the pack
@@ -49,11 +61,9 @@ def balanced_packing(weight: np.ndarray, num_packs: int) -> Tuple[np.ndarray, np
def replicate_experts(weight: np.ndarray, num_phy: int) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
"""
Replicate `num_log` experts to `num_phy` replicas, such that the maximum load of all replicas is minimized.
Parameters:
weight: [X, num_log]
num_phy: total number of experts after replication
Returns:
phy2log: [X, num_phy], logical expert id of each physical expert
rank: [X, num_phy], the replica rank
@@ -88,7 +98,6 @@ def rebalance_experts_intra_node(
num_groups: number of expert groups
num_nodes: number of server nodes, where the intra-node network (e.g, NVLink) is faster
num_gpus: number of GPUs, must be a multiple of `num_nodes`
Returns:
physical_to_logical_map: [num_moe_layers, num_physical_experts]
logical_to_physical_map: [num_moe_layers, num_logical_experts, X]
@@ -155,7 +164,6 @@ def rebalance_experts_hierarchical(
num_groups: number of expert groups
num_nodes: number of server nodes, where the intra-node network (e.g, NVLink) is faster
num_gpus: number of GPUs, must be a multiple of `num_nodes`
Returns:
physical_to_logical_map: [num_moe_layers, num_physical_experts]
logical_to_physical_map: [num_moe_layers, num_logical_experts, X]
@@ -215,14 +223,12 @@ def rebalance_experts(
) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
"""
Entry point for expert-parallelism load balancer.
Parameters:
weight: [layers, num_logical_experts], the load statistics for all logical experts
num_replicas: number of physical experts, must be a multiple of `num_gpus`
num_groups: number of expert groups
num_nodes: number of server nodes, where the intra-node network (e.g, NVLink) is faster
num_gpus: number of GPUs, must be a multiple of `num_nodes`
Returns:
physical_to_logical_map: [layers, num_replicas], the expert index of each replica
logical_to_physical_map: [layers, num_logical_experts, X], the replica indices for each expert
@@ -267,9 +273,6 @@ def main():
num_nodes = 4
num_gpus = 4 * 8
# model_tokens_per_expert_stats_list = np.ones(
# (num_hidden_layers, num_expert), dtype=int)
model_tokens_per_expert_stats_list = np.random.randint(low=1, high=10, size=(num_hidden_layers, num_expert))
phy2log, phyrank, logcnt = rebalance_experts(

View File

@@ -1,19 +1,33 @@
"""
redundant expert manger
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
import threading
import time
from http import HTTPStatus
from multiprocessing import Pipe, Process, shared_memory
from multiprocessing import Pipe, Process
import numpy as np
import requests
from fastdeploy.config import FDConfig
from fastdeploy.eplb.async_expert_loader import load_model_weights_process
from fastdeploy.eplb.eplb import rebalance_experts
from fastdeploy.eplb.utils import RearrangeExpertState, RedundantExpertWorkload
from fastdeploy.utils import envs, get_logger
from fastdeploy.eplb.utils import RedundantExpertWorkload
from fastdeploy.inter_communicator import IPCSignal, RearrangeExpertStatus
from fastdeploy.utils import get_logger
class RedundantExpertManager:
@@ -21,7 +35,13 @@ class RedundantExpertManager:
RedundantExpertManger
"""
def __init__(self, rank=0, ep_size=64, fd_config=None):
def __init__(
self,
rank: int = 0,
ep_size: int = 32,
fd_config: FDConfig = None,
ipc_signal_suffix: int = 0,
):
self.logger = get_logger("eplb_expert_manager", "eplb_{0}.log".format(rank))
self.rank = rank
@@ -30,9 +50,11 @@ class RedundantExpertManager:
self.eplb_config = fd_config.eplb_config
self.api_user = self.eplb_config.redundant_expert_api_user
self.api_passwd = self.eplb_config.redundant_expert_api_password
self.num_hidden_layers = self.eplb_config.model_config.num_layers
self.num_logical_experts = self.eplb_config.model_config.moe_num_experts
self.num_redundant_experts = self.eplb_config.redundant_experts_num
self.num_hidden_layers = self.fd_config.model_config.num_hidden_layers
self.num_logical_experts = self.fd_config.model_config.moe_num_experts
self.ipc_signal_suffix = ipc_signal_suffix
self.local_rank = self.rank % self.fd_config.parallel_config.tensor_parallel_size
self.num_replicas = self.num_logical_experts + self.num_redundant_experts
self.num_groups = self.num_logical_experts
@@ -112,9 +134,12 @@ class RedundantExpertManager:
name=f"eplb::async_load_model_{rank}",
args=(
self.rank,
self.fd_config.model_config.model,
self.expert_per_rank,
self.fd_config.model_config.moe_layer_start_index,
self.eplb_config.moe_quant_type,
self.ipc_signal_suffix,
self.eplb_config,
child_data_conn,
child_mg_conn,
),
@@ -130,9 +155,6 @@ class RedundantExpertManager:
strategy {self.eplb_config.redundant_expert_eplb_strategy}"
)
def get_unique_name(self, name):
return f"{envs.get_unique_name(name + '_dprank_' + str(self.rank))}"
def get_ep_rank_to_expert_id_list(self):
"""
get_ep_rank_to_expert_id_list
@@ -147,66 +169,84 @@ class RedundantExpertManager:
"""
listen_rearrange_expert_signal
"""
if self.rank == 0:
rearrange_experts_ips_size = np.zeros([1], dtype=np.int32)
shm_rearrange_experts_ips_size = shared_memory.SharedMemory(
dp_ipc_signal_suffix = f"{self.ipc_signal_suffix}_dp{self.fd_config.parallel_config.local_data_parallel_id}"
if self.local_rank == 0:
rearrange_experts_ips_size_array = np.zeros([1], dtype=np.int32)
rearrange_experts_ips_size_signal = IPCSignal(
name="rearrange_experts_ips_size",
array=rearrange_experts_ips_size_array,
dtype=np.int32,
suffix=dp_ipc_signal_suffix,
create=False,
size=rearrange_experts_ips_size.nbytes,
name=self.get_unique_name("rearrange_experts_ips_size"),
)
rearrange_experts_ips_size_array = np.ndarray(
rearrange_experts_ips_size.shape,
dtype=rearrange_experts_ips_size.dtype,
buffer=shm_rearrange_experts_ips_size.buf,
)
shm_rearrange_experts_ips_list = shared_memory.SharedMemory(
shm_rearrange_experts_ips_list = IPCSignal(
name="rearrange_experts_ips_list",
shm_size=self.eplb_config.redundant_expert_ip_shm_size,
suffix=dp_ipc_signal_suffix,
create=False,
size=1024,
name=self.get_unique_name("rearrange_experts_ips_list"),
)
rearrange_experts_status = np.zeros([1], dtype=np.int32)
shm_rearrange_experts_status = shared_memory.SharedMemory(
rearrange_experts_signal = IPCSignal(
name="rearrange_experts_status",
array=rearrange_experts_status,
dtype=np.int32,
suffix=dp_ipc_signal_suffix,
create=False,
size=rearrange_experts_status.nbytes,
name=self.get_unique_name("rearrange_experts_status"),
)
rearrange_experts_status_array = np.ndarray(
rearrange_experts_status.shape,
dtype=rearrange_experts_status.dtype,
buffer=shm_rearrange_experts_status.buf,
)
signal_update_weight_from_tensor = np.zeros([1], dtype=np.int32)
self.signal_update_weight_from_tensor_array = IPCSignal(
name="signal_update_weight_from_tensor",
array=signal_update_weight_from_tensor,
dtype=np.int32,
suffix=dp_ipc_signal_suffix,
create=False,
)
tp_ipc_signal_suffix = f"{dp_ipc_signal_suffix}_tp{self.local_rank}"
signal_update_weight_from_disk = np.zeros([1], dtype=np.int32)
shm_signal_update_weight_from_disk = shared_memory.SharedMemory(
signal_update_weight_from_disk_array = IPCSignal(
name="signal_update_weight_from_disk",
array=signal_update_weight_from_disk,
dtype=np.int32,
suffix=tp_ipc_signal_suffix,
create=False,
size=signal_update_weight_from_disk.nbytes,
name=self.get_unique_name("signal_update_weight_from_disk"),
)
signal_update_weight_from_disk_array = np.ndarray(
signal_update_weight_from_disk.shape,
dtype=signal_update_weight_from_disk.dtype,
buffer=shm_signal_update_weight_from_disk.buf,
)
experts_token_stats = np.zeros((self.num_hidden_layers, 64), dtype=np.int32)
shm_all_experts_token_stats = shared_memory.SharedMemory(
experts_token_stats = np.zeros(
(self.fd_config.model_config.num_hidden_layers, self.fd_config.model_config.moe_num_experts),
dtype=np.int32,
)
shm_all_experts_token_stats = IPCSignal(
name="all_experts_token_stats",
array=experts_token_stats,
dtype=np.int32,
suffix=tp_ipc_signal_suffix,
create=False,
)
result_update_weight_from_disk = np.zeros([1], dtype=np.int32)
self.update_weight_from_disk_result = IPCSignal(
name="result_update_weight_from_disk",
array=result_update_weight_from_disk,
dtype=np.int32,
suffix=tp_ipc_signal_suffix,
create=False,
size=experts_token_stats.nbytes,
name=self.get_unique_name("all_experts_token_stats"),
)
while True:
if self.rank == 0:
if self.local_rank == 0:
now = int(time.time())
if rearrange_experts_ips_size_array[0] > 0:
if rearrange_experts_ips_size_signal.value[0] > 0:
# step 1. all reduce experts token stats
address = bytes(shm_rearrange_experts_ips_list.buf[: rearrange_experts_ips_size_array[0]]).decode(
"utf-8"
)
address = bytes(
shm_rearrange_experts_ips_list.shm.buf[: rearrange_experts_ips_size_signal.value[0]]
).decode("utf-8")
self.logger.info(f"redundant_expert: all rank ips {address}")
rearrange_experts_ips_size_array[0] = 0
rearrange_experts_status_array[0] = RearrangeExpertState.doing.value
rearrange_experts_ips_size_signal.value[0] = 0
rearrange_experts_signal.value[0] = RearrangeExpertStatus.DOING.value
self.dp_rank_address = address.strip().split(";")
if self.allreduce_experts_stat():
@@ -214,30 +254,25 @@ class RedundantExpertManager:
self.load_weight_begin_ts = now
self.logger.info("redundant_expert: all-reduce experts stats success")
else:
rearrange_experts_status_array[0] = RearrangeExpertState.free.value
rearrange_experts_signal.value[0] = RearrangeExpertStatus.FREE.value
self.logger.warning("redundant_expert: all-reduce experts stats fail")
elif self.need_allgather_load_weight_result and self.allreduce_load_weight_result():
# step 3. all reduce the result of load weight from disk
self.need_allgather_load_weight_result = False
rearrange_experts_status_array[0] = RearrangeExpertState.load_succ.value
rearrange_experts_signal.value[0] = RearrangeExpertStatus.LOAD_SUCC.value
self.rearrange_end_ts = now
if rearrange_experts_status_array[0] > 1 and (
if rearrange_experts_signal.value[0] > 1 and (
now - self.rearrange_end_ts > self.rearrange_reset_interval
):
# reset rearrange status
rearrange_experts_status_array[0] = RearrangeExpertState.free.value
rearrange_experts_signal.value[0] = RearrangeExpertStatus.FREE.value
if signal_update_weight_from_disk_array[0] == 1:
if signal_update_weight_from_disk_array.value[0] == 1:
# step 2. async load weight: disk -> memory
expert_token_stats = np.ndarray(
experts_token_stats.shape,
dtype=experts_token_stats.dtype,
buffer=shm_all_experts_token_stats.buf,
)
self.model_tokens_per_expert_stats_list[:] = expert_token_stats[:]
self.model_tokens_per_expert_stats_list[:] = shm_all_experts_token_stats.value[:]
self.caculate_expert_rank_table()
self.update_weight_from_disk()
signal_update_weight_from_disk_array[0] = 0
signal_update_weight_from_disk_array.value[0] = 0
time.sleep(0.5)
def caculate_expert_rank_table(self, is_init=False):
@@ -274,7 +309,7 @@ class RedundantExpertManager:
self.model_expert_id_to_ep_rank_array[..., : logical_to_physical_map.shape[-1]] = logical_to_physical_map[:]
self.model_expert_in_rank_num_list[:] = expert_count[:]
if self.rank == 0:
if self.local_rank == 0:
workload = RedundantExpertWorkload()
workload.tokens_per_expert_stats_list = self.model_tokens_per_expert_stats_list.tolist()
workload.ep_rank_to_expert_id_list = rank_expert_list.tolist()
@@ -287,18 +322,7 @@ class RedundantExpertManager:
update_weight_from_disk
"""
begin_time = time.time()
result_update_weight_from_disk = np.zeros([1], dtype=np.int32)
shm_result_update_weight_from_disk = shared_memory.SharedMemory(
create=False,
size=result_update_weight_from_disk.nbytes,
name=self.get_unique_name("result_update_weight_from_disk"),
)
result_update_weight_from_disk_array = np.ndarray(
result_update_weight_from_disk.shape,
dtype=result_update_weight_from_disk.dtype,
buffer=shm_result_update_weight_from_disk.buf,
)
result_update_weight_from_disk_array[0] = 0
self.update_weight_from_disk_result.value[0] = 0
self.logger.info(f"redundant_expert: update_weight_from_disk send to async process, rank {self.rank}")
self.parent_mg_conn.send(
@@ -312,7 +336,7 @@ class RedundantExpertManager:
self.tensor_infos = response["weights"]
# 更新权重加载结果
result_update_weight_from_disk_array[0] = 1 if response["result"] else -1
self.update_weight_from_disk_result.value[0] = 1 if response["result"] else -1
self.logger.info(
"redundant_expert: update_weight_from_disk end, rank"
+ f" {self.rank} {response['result']}, cost {int(time.time() - begin_time)}s"
@@ -330,8 +354,8 @@ class RedundantExpertManager:
"""
allgather_expert_token_stats
"""
success_count = 0
expert_token_stats = np.zeros((self.num_hidden_layers, self.num_logical_experts), dtype=np.int32)
success_count = 0
for addr in self.dp_rank_address:
try:
# TODO: 请求失败重试
@@ -347,8 +371,10 @@ class RedundantExpertManager:
+ f"addr {addr}, res {res.status_code} {res.json()}"
)
break
for meta_data in res.json()["data"]:
expert_token_stats += np.array(meta_data, dtype=np.int32)
success_count += 1
expert_token_stats += np.array(res.json()["data"], dtype=np.int32)
except Exception as e:
self.logger.error(f"redundant_expert: allgather_expert_token_stats fail. addr {addr}, error {e}")
if success_count == len(self.dp_rank_address):
@@ -426,18 +452,7 @@ class RedundantExpertManager:
or not self.eplb_config.redundant_expert_enable_schedule_cordon
):
self.logger.info("redundant_expert: allreduce_load_weight_result success, notify infer.py")
signal_update_weight_from_tensor = np.zeros([1], dtype=np.int32)
shm_signal_update_weight_from_tensor = shared_memory.SharedMemory(
create=False,
size=signal_update_weight_from_tensor.nbytes,
name=self.get_unique_name("signal_update_weight_from_tensor"),
)
signal_update_weight_from_tensor_array = np.ndarray(
signal_update_weight_from_tensor.shape,
dtype=signal_update_weight_from_tensor.dtype,
buffer=shm_signal_update_weight_from_tensor.buf,
)
signal_update_weight_from_tensor_array[0] = 1
self.signal_update_weight_from_tensor_array.value[0] = 1
return True
def allgather_load_weight_result(self):
@@ -465,140 +480,28 @@ class RedundantExpertManager:
+ f"addr {addr}, res {res.status_code} {res.json()}"
)
break
result = res.json()["data"]
result_list = res.json()["data"]
self.logger.info(
f"redundant_expert: allgather_load_weight_result success. addr {addr}, result {result}"
f"redundant_expert: allgather_load_weight_result success. addr {addr}, result_list {result_list}"
)
if result == 1:
success_count += 1
elif result == -1:
fail_count += 1
self.logger.error(
f"redundant_expert: allgather_load_weight_result fail. addr {addr}, result {result}"
)
exist_fail = True
for result in result_list:
if result == 1:
success_count += 1
elif result == -1:
fail_count += 1
self.logger.error(
f"redundant_expert: allgather_load_weight_result fail. addr {addr}, result {result}"
)
exist_fail = True
except Exception as e:
self.logger.error(f"redundant_expert: allgather_load_weight_result error. addr {addr}, error {e}")
if success_count == len(self.dp_rank_address):
self.logger.info("redundant_expert: allgather_load_weight_result all success")
all_success = True
else:
if fail_count > 0:
self.logger.info(
"redundant_expert: allgather_load_weight_result not all ready, "
+ f"succ {success_count} fail {fail_count} total {len(self.dp_rank_address)}"
)
else:
self.logger.info("redundant_expert: allgather_load_weight_result all success")
all_success = True
return all_success, exist_fail
def init_shared_memory_for_eplb_rank0(rank):
rearrange_experts_ips_size = np.zeros([1], dtype=np.int32)
shm_rearrange_experts_ips_size = shared_memory.SharedMemory(
create=True,
size=rearrange_experts_ips_size.nbytes,
name=f"{envs.get_unique_name('rearrange_experts_ips_size_dprank' + rank)}",
)
rearrange_experts_ips_size_array = np.ndarray(
rearrange_experts_ips_size.shape,
dtype=rearrange_experts_ips_size.dtype,
buffer=shm_rearrange_experts_ips_size.buf,
)
shm_rearrange_experts_ips_list = shared_memory.SharedMemory(
create=True,
size=envs.FD_REDUNDANT_EXPERT_IP_SHM_SIZE,
name=f"{envs.get_unique_name('rearrange_experts_ips_list_dprank' + rank)}",
)
# 记录专家重排状态
rearrange_experts_status = np.zeros([1], dtype=np.int32)
shm_rearrange_experts_status = shared_memory.SharedMemory(
create=True,
size=rearrange_experts_status.nbytes,
name=f"{envs.get_unique_name('rearrange_experts_status_dprank' + rank)}",
)
rearrange_experts_status_array = np.ndarray(
rearrange_experts_status.shape, dtype=rearrange_experts_status.dtype, buffer=shm_rearrange_experts_status.buf
)
# 接收更新权重的信号
signal_update_weight_from_tensor = np.zeros([1], dtype=np.int32)
shm_signal_update_weight_from_tensor = shared_memory.SharedMemory(
create=True,
size=signal_update_weight_from_tensor.nbytes,
name=f"{envs.get_unique_name('signal_update_weight_from_tensor_dprank' + rank) }",
)
signal_update_weight_from_tensor_array = np.ndarray(
signal_update_weight_from_tensor.shape,
dtype=signal_update_weight_from_tensor.dtype,
buffer=shm_signal_update_weight_from_tensor.buf,
)
return (
rearrange_experts_ips_size_array,
shm_rearrange_experts_ips_list,
rearrange_experts_status_array,
signal_update_weight_from_tensor_array,
)
def init_shared_memory_for_eplb_each_rank(fd_config, rank):
# 记录专家负载
num_layers = fd_config.model_config.num_hidden_layers
num_experts = fd_config.model_config.moe_num_experts
experts_token_stats = np.zeros((num_layers, num_experts), dtype=np.int32)
shm_local_experts_token_stats = shared_memory.SharedMemory(
create=True,
size=experts_token_stats.nbytes,
name=f"{envs.get_unique_name('local_experts_token_stats_dprank' + rank)}",
)
local_experts_token_stats_array = np.ndarray(
experts_token_stats.shape, dtype=experts_token_stats.dtype, buffer=shm_local_experts_token_stats.buf
)
# TODO: 全局专家负载状态是一样的节点上的所有DP可以共用一份但需要避免多个DP同时更新
shm_all_experts_token_stats = shared_memory.SharedMemory(
create=True,
size=experts_token_stats.nbytes,
name=f"{envs.get_unique_name('all_experts_token_stats_dprank' + rank)}",
)
expert_tokens_stats_array = np.ndarray(
experts_token_stats.shape, dtype=experts_token_stats.dtype, buffer=shm_all_experts_token_stats.buf
)
# 接收加载权重的信号
signal_update_weight_from_disk = np.zeros([1], dtype=np.int32)
shm_signal_update_weight_from_disk = shared_memory.SharedMemory(
create=True,
size=signal_update_weight_from_disk.nbytes,
name=f"{envs.get_unique_name('signal_update_weight_from_disk_dprank' + rank)}",
)
signal_update_weight_from_disk_array = np.ndarray(
signal_update_weight_from_disk.shape,
dtype=signal_update_weight_from_disk.dtype,
buffer=shm_signal_update_weight_from_disk.buf,
)
# 记录加载权重的结果
result_update_weight_from_disk = np.zeros([1], dtype=np.int32)
shm_result_update_weight_from_disk = shared_memory.SharedMemory(
create=True,
size=result_update_weight_from_disk.nbytes,
name=f"{envs.get_unique_name('result_update_weight_from_disk_dprank' + rank)}",
)
result_update_weight_from_disk_array = np.ndarray(
result_update_weight_from_disk.shape,
dtype=result_update_weight_from_disk.dtype,
buffer=shm_result_update_weight_from_disk.buf,
)
# 接收清零专家负载的信号
signal_clear_experts_token_stats = np.zeros([1], dtype=np.int32)
shm_signal_clear_experts_token_stats = shared_memory.SharedMemory(
create=True,
size=signal_clear_experts_token_stats.nbytes,
name=f"{envs.get_unique_name('signal_clear_experts_token_stats_dprank' + rank)}",
)
signal_clear_experts_token_stats_array = np.ndarray(
signal_clear_experts_token_stats.shape,
dtype=signal_clear_experts_token_stats.dtype,
buffer=shm_signal_clear_experts_token_stats.buf,
)
return (
local_experts_token_stats_array,
expert_tokens_stats_array,
signal_update_weight_from_disk_array,
result_update_weight_from_disk_array,
signal_clear_experts_token_stats_array,
)

View File

@@ -1,9 +1,27 @@
"""eplb utilities"""
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
import json
import os
import time
from enum import Enum
import numpy as np
from fastdeploy.config import FDConfig
from fastdeploy.inter_communicator import IPCSignal
class RedundantExpertWorkload:
@@ -47,13 +65,101 @@ class RedundantExpertWorkload:
return {}, f"redundant_expert: load file {self.meta_file_name} failed, {e}"
class RearrangeExpertState(Enum):
"""RearrangeExpertState"""
def init_eplb_signals(config: FDConfig, ipc_signal_suffix):
"""
Initialize shared memory to indicate eplb status
"""
if config.parallel_config.tensor_parallel_rank != 0:
# only TP rank 0 need to init eplb signals, rank 0 manage all EPLB signals for all TP ranks
return
free = 0
doing = 1
load_succ = 2 # load weight from disk success
done = 3
dp_ipc_signal_suffix = f"{ipc_signal_suffix}_dp{config.parallel_config.local_data_parallel_id}"
# rearrange_experts_status Record the expert's rearrangement status
rearrange_experts_array = np.zeros([1], dtype=np.int32)
_ = IPCSignal(
name="rearrange_experts_status",
array=rearrange_experts_array,
dtype=np.int32,
suffix=dp_ipc_signal_suffix,
create=True,
)
# Record all DP rank IPs when receiving expert rearrangement requests
rearrange_experts_ips_size_array = np.zeros([1], dtype=np.int32)
_ = IPCSignal(
name="rearrange_experts_ips_size",
array=rearrange_experts_ips_size_array,
dtype=np.int32,
suffix=dp_ipc_signal_suffix,
create=True,
)
_ = IPCSignal(
name="rearrange_experts_ips_list",
shm_size=config.eplb_config.redundant_expert_ip_shm_size,
suffix=dp_ipc_signal_suffix,
create=True,
)
# Receive signals for updating weights
signal_update_weight_from_tensor = np.zeros([1], dtype=np.int32)
_ = IPCSignal(
name="signal_update_weight_from_tensor",
array=signal_update_weight_from_tensor,
dtype=np.int32,
suffix=dp_ipc_signal_suffix,
create=True,
)
for rank_id in range(config.parallel_config.tensor_parallel_size):
tp_ipc_signal_suffix = f"{dp_ipc_signal_suffix}_tp{rank_id}"
# Record expert workload
experts_token_stats = np.zeros(
(config.model_config.num_hidden_layers, config.model_config.moe_num_experts),
dtype=np.int32,
)
_ = IPCSignal(
name="all_experts_token_stats",
array=experts_token_stats,
dtype=np.int32,
suffix=tp_ipc_signal_suffix,
create=True,
)
_ = IPCSignal(
name="local_experts_token_stats",
array=experts_token_stats,
dtype=np.int32,
suffix=tp_ipc_signal_suffix,
create=True,
)
# Receive signals for loading weights
signal_update_weight_from_disk = np.zeros([1], dtype=np.int32)
_ = IPCSignal(
name="signal_update_weight_from_disk",
array=signal_update_weight_from_disk,
dtype=np.int32,
suffix=tp_ipc_signal_suffix,
create=True,
)
# Receive signals for clearing expert loads
clear_experts_token_stats = np.zeros([1], dtype=np.int32)
_ = IPCSignal(
name="signal_clear_experts_token_stats",
array=clear_experts_token_stats,
dtype=np.int32,
suffix=tp_ipc_signal_suffix,
create=True,
)
result_update_weight_from_disk = np.zeros([1], dtype=np.int32)
_ = IPCSignal(
name="result_update_weight_from_disk",
array=result_update_weight_from_disk,
dtype=np.int32,
suffix=tp_ipc_signal_suffix,
create=True,
)
if __name__ == "__main__":

View File

@@ -22,6 +22,7 @@ from .ipc_signal_const import (
KVCacheStatus,
ModelWeightsStatus,
PrefixTreeStatus,
RearrangeExpertStatus,
)
from .zmq_client import ZmqIpcClient
from .zmq_server import ZmqIpcServer, ZmqTcpServer
@@ -38,4 +39,5 @@ __all__ = [
"PrefixTreeStatus",
"ModelWeightsStatus",
"KVCacheStatus",
"RearrangeExpertStatus",
]

View File

@@ -55,10 +55,11 @@ class IPCSignal:
def __init__(
self,
name: str,
array: np.ndarray,
dtype: np.dtype,
array: np.ndarray = None,
dtype: np.dtype = None,
suffix: int = None,
create: bool = True,
shm_size: int = None,
) -> None:
"""Initialize or connect to a shared memory block.
@@ -68,29 +69,45 @@ class IPCSignal:
dtype: Data type of the array (must match array.dtype).
suffix: Suffix number that will be appended to the name.
create: If True, creates new memory block; otherwise connects to existing.
shm_size: Size of the shared memory block in bytes.
Raises:
AssertionError: If create=True but memory already exists, or dtype mismatch.
"""
assert isinstance(array, np.ndarray), "Input must be a numpy array"
assert dtype == array.dtype, "Specified dtype must match array dtype"
# Set a suffix for name to avoid name conflict while there are multiple engine launched
if suffix is not None:
name = name + f".{suffix}"
if create:
llm_logger.debug(f"creating ipc signal: {name}")
if shared_memory_exists(name):
llm_logger.warning(f"ShareMemory: {name} already exists, delete it")
SharedMemory(name=name, create=False).unlink()
self.shm = SharedMemory(create=True, size=array.nbytes, name=name)
self.value: np.ndarray = np.ndarray(array.shape, dtype=array.dtype, buffer=self.shm.buf)
self.value[:] = array # Initialize with input array data
if dtype is None or array is None:
assert shm_size is not None, "shm_size must be specified if array and dtype are None"
if create:
llm_logger.debug(f"creating ipc signal: {name}")
if shared_memory_exists(name):
llm_logger.warning(f"ShareMemory: {name} already exists, delete it")
SharedMemory(name=name, create=False).unlink()
self.shm = SharedMemory(create=True, size=shm_size, name=name)
self.value = None
else:
llm_logger.debug(f"attaching ipc signal: {name}")
self.shm = SharedMemory(name=name)
self.value = None
else:
llm_logger.debug(f"attaching ipc signal: {name}")
self.shm = SharedMemory(name=name)
self.value: np.ndarray = np.ndarray(array.shape, dtype=array.dtype, buffer=self.shm.buf)
assert isinstance(array, np.ndarray), "Input must be a numpy array"
assert dtype == array.dtype, "Specified dtype must match array dtype"
if create:
llm_logger.debug(f"creating ipc signal: {name}")
if shared_memory_exists(name):
llm_logger.warning(f"ShareMemory: {name} already exists, delete it")
SharedMemory(name=name, create=False).unlink()
self.shm = SharedMemory(create=True, size=array.nbytes, name=name)
self.value: np.ndarray = np.ndarray(array.shape, dtype=array.dtype, buffer=self.shm.buf)
self.value[:] = array # Initialize with input array data
else:
llm_logger.debug(f"attaching ipc signal: {name}")
self.shm = SharedMemory(name=name)
self.value: np.ndarray = np.ndarray(array.shape, dtype=array.dtype, buffer=self.shm.buf)
def clear(self) -> None:
"""Release system resources and unlink the shared memory block."""

View File

@@ -1,4 +1,21 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
from dataclasses import dataclass
from enum import Enum
@dataclass
@@ -30,3 +47,10 @@ class ExistTaskStatus:
EMPTY = 0
EXIST = 1
REFUSE = 2
class RearrangeExpertStatus(Enum):
FREE = 0
DOING = 1
LOAD_SUCC = 2 # load weight from disk success
DONE = 3

View File

@@ -368,7 +368,7 @@ class Ernie4_5_Model(nn.Layer):
fd_config.model_config.pretrained_config.prefix_name = "ernie"
self.fd_config = fd_config
self.redundant_table_manger = None
if fd_config.model_config.enable_redundant_experts is True:
if fd_config.eplb_config.enable_eplb is True:
self.redundant_table_manger = RedundantExpertManger(
n_routed_experts=fd_config.model_config.moe_num_experts,
num_hidden_layers=fd_config.model_config.num_hidden_layers,

View File

@@ -64,6 +64,7 @@ class RolloutModelConfig:
plas_attention_config: str = None,
data_parallel_size: int = 1,
num_nextn_predict_layers: int = 0,
eplb_config: str = {},
):
# Required parameters
self.model = model_name_or_path
@@ -111,6 +112,7 @@ class RolloutModelConfig:
self.ips = None
self.plas_attention_config = plas_attention_config
self.num_nextn_predict_layers = num_nextn_predict_layers
self.eplb_config = eplb_config
def __str__(self):
return "\n".join(f"{k}: {v}" for k, v in self.__dict__.items())

View File

@@ -18,7 +18,6 @@ import argparse
import json
import os
import time
from multiprocessing import shared_memory
from typing import Tuple
import numpy as np
@@ -49,9 +48,13 @@ from fastdeploy.eplb.async_expert_loader import (
load_tensor_from_shm_mem,
)
from fastdeploy.eplb.experts_manager import RedundantExpertManager
from fastdeploy.eplb.utils import RearrangeExpertState
from fastdeploy.inter_communicator import EngineWorkerQueue as TaskQueue
from fastdeploy.inter_communicator import ExistTaskStatus, IPCSignal, ModelWeightsStatus
from fastdeploy.inter_communicator import (
ExistTaskStatus,
IPCSignal,
ModelWeightsStatus,
RearrangeExpertStatus,
)
from fastdeploy.model_executor.layers.quantization import parse_quant_config
from fastdeploy.model_executor.utils import v1_loader_support
from fastdeploy.platforms import current_platform
@@ -287,68 +290,122 @@ class PaddleDisWorkerProc:
else:
paddle.distributed.barrier(self.parallel_config.tp_group)
def _init_eplb_signal(self):
if not self.eplb_config.enable_eplb:
return
local_rank = self.local_rank % self.parallel_config.tensor_parallel_size
self.last_dump_expert_workload_ts = 0
self.experts_manager = RedundantExpertManager(
rank=self.local_rank,
ep_size=self.ranks,
fd_config=self.fd_config,
ipc_signal_suffix=self.parallel_config.engine_worker_queue_port,
)
dp_ipc_signal_suffix = (
f"{self.parallel_config.engine_worker_queue_port}_dp{self.parallel_config.local_data_parallel_id}"
)
if local_rank == 0: # master rank0
signal_update_weight_from_tensor = np.zeros([1], dtype=np.int32)
self.signal_update_weight_from_tensor_array = IPCSignal(
name="signal_update_weight_from_tensor",
array=signal_update_weight_from_tensor,
dtype=np.int32,
suffix=dp_ipc_signal_suffix,
create=False,
)
rearrange_experts_status = np.zeros([1], dtype=np.int32)
self.rearrange_experts_signal = IPCSignal(
name="rearrange_experts_status",
array=rearrange_experts_status,
dtype=np.int32,
suffix=dp_ipc_signal_suffix,
create=False,
)
tp_ipc_signal_suffix = f"{dp_ipc_signal_suffix}_tp{local_rank}"
experts_token_stats = np.zeros(
(self.fd_config.model_config.num_hidden_layers, self.fd_config.model_config.moe_num_experts),
dtype=np.int32,
)
self.local_experts_token_stats_array = IPCSignal(
name="local_experts_token_stats",
array=experts_token_stats,
dtype=np.int32,
suffix=tp_ipc_signal_suffix,
create=False,
)
clear_experts_token_stats = np.zeros([1], dtype=np.int32)
self.signal_clear_experts_token_stats = IPCSignal(
name="signal_clear_experts_token_stats",
array=clear_experts_token_stats,
dtype=np.int32,
suffix=tp_ipc_signal_suffix,
create=False,
)
self.mmap_infos = create_mmap(
[MODEL_MAIN_NAME],
self.local_rank,
self.ranks,
shm_uuid=self.parallel_config.engine_worker_queue_port,
eplb_config=self.eplb_config,
logger=logger,
)
def _run_eplb(self, tp_rank):
"""internal call to run eplb"""
if not self.eplb_config.enable_eplb:
return
rearrange_time = time.time()
# Get expert load
if self.local_experts_token_stats_array.value is not None and (
int(rearrange_time) - self.last_dump_expert_workload_ts
> self.eplb_config.redundant_expert_dump_workload_interval
):
self.last_dump_expert_workload_ts = int(rearrange_time)
clear_stat = False
if self.signal_clear_experts_token_stats.value[0] == 1:
clear_stat = True
self.signal_clear_experts_token_stats.value[0] = 0
(
new_stats_array,
_,
_,
_,
) = self.worker.get_model().redundant_table_manger.get_expert_tokens_stats(clear_stat=clear_stat)
self.local_experts_token_stats_array.value[:] = new_stats_array[:]
elif self.local_experts_token_stats_array.value is None:
logger.warning("redundant_expert: local_experts_token_stats not init")
# All DP synchronously update weights
broadcast_value = 0
if tp_rank == 0 and self.signal_update_weight_from_tensor_array.value[0] == 1:
logger.info("redundant_expert: update_weight_from_tensor broadcast signal")
self.signal_update_weight_from_tensor_array.value[0] = 0
broadcast_value = REARRANGE_EXPERT_MAGIC_NUM
data = paddle.to_tensor([broadcast_value])
paddle.distributed.broadcast(data, 0)
if data[0] == REARRANGE_EXPERT_MAGIC_NUM:
self.update_weights_from_tensor(self.mmap_infos)
logger.info(
f"redundant_expert: update_weight_from_tensor success, cost {(time.time() - rearrange_time)*1000}ms"
)
paddle.distributed.barrier()
if tp_rank == 0:
self.rearrange_experts_signal.value[0] = RearrangeExpertStatus.DONE.value
logger.info("redundant_expert: done")
def event_loop_normal(self) -> None:
"""Main event loop for Paddle Distributed Workers.
TODO(gongshaotian): support remote calling of functions that control worker.
"""
if self.eplb_config.enable_redundant_experts:
self.last_dump_expert_workload_ts = 0
self.experts_manager = RedundantExpertManager(
rank=self.local_rank, ep_size=self.ranks, fd_config=self.fd_config
)
num_layers = self.fd_config.model_config.num_hidden_layers
num_experts = self.fd_config.model_config.moe_num_experts
expert_token_stats = np.zeros((num_layers, num_experts), dtype=np.int32)
shm_local_experts_token_stats = shared_memory.SharedMemory(
create=False,
size=expert_token_stats.nbytes,
name=f"{envs.get_unique_name('local_experts_token_stats_dprank' + self.local_rank)}",
)
expert_tokens_stats_array = np.ndarray(
expert_token_stats.shape, dtype=expert_token_stats.dtype, buffer=shm_local_experts_token_stats.buf
)
signal_clear_experts_token_stats = np.zeros([1], dtype=np.int32)
shm_signal_clear_experts_token_stats = shared_memory.SharedMemory(
create=False,
size=signal_clear_experts_token_stats.nbytes,
name=f"{envs.get_unique_name('signal_clear_experts_token_stats_dprank' + self.local_rank)}",
)
signal_clear_experts_token_stats_array = np.ndarray(
signal_clear_experts_token_stats.shape,
dtype=signal_clear_experts_token_stats.dtype,
buffer=shm_signal_clear_experts_token_stats.buf,
)
if self.local_rank == 0:
signal_update_weight_from_tensor = np.zeros([1], dtype=np.int32)
shm_signal_update_weight_from_tensor = shared_memory.SharedMemory(
create=False,
size=signal_update_weight_from_tensor.nbytes,
name=f"{envs.get_unique_name('signal_update_weight_from_tensor_dprank' + self.local_rank)}",
)
signal_update_weight_from_tensor_array = np.ndarray(
signal_update_weight_from_tensor.shape,
dtype=signal_update_weight_from_tensor.dtype,
buffer=shm_signal_update_weight_from_tensor.buf,
)
rearrange_experts_status = np.zeros([1], dtype=np.int32)
shm_rearrange_experts_status = shared_memory.SharedMemory(
create=False,
size=rearrange_experts_status.nbytes,
name=f"{envs.get_unique_name('rearrange_experts_status_dprank' + self.local_rank)}",
)
rearrange_experts_status_array = np.ndarray(
rearrange_experts_status.shape,
dtype=rearrange_experts_status.dtype,
buffer=shm_rearrange_experts_status.buf,
)
expert_workload_dump_interval = envs.FD_REDUNDANT_EXPERT_DUMP_WORKLOAD_INTERVAL
mmap_infos = create_mmap(
[MODEL_MAIN_NAME], self.local_rank, self.ranks, shm_uuid=os.getenv("SHM_UUID", ""), logger=logger
)
# init eplb signal
self._init_eplb_signal()
tp_size = self.parallel_config.tensor_parallel_size
# Currently, only support single node
self.nnode = int((tp_size + 7) // 8)
@@ -358,44 +415,8 @@ class PaddleDisWorkerProc:
self.model_weights_signal = np.zeros([1], dtype=np.int32)
while True:
if self.eplb_config.enable_redundant_experts:
rearrange_time = time.time()
# 获取专家负载
if expert_tokens_stats_array is not None and (
int(rearrange_time) - self.last_dump_expert_workload_ts > expert_workload_dump_interval
):
self.last_dump_expert_workload_ts = int(rearrange_time)
clear_stat = False
if signal_clear_experts_token_stats_array[0] == 1:
clear_stat = True
signal_clear_experts_token_stats_array[0] = 0
(
new_stats_array,
_,
_,
_,
) = self.worker.get_model().redundant_table_manger.get_expert_tokens_stats(clear_stat=clear_stat)
expert_tokens_stats_array[:] = new_stats_array[:]
elif expert_tokens_stats_array is None:
logger.warning("redundant_expert: expert_tokens_stats_array not init")
# 所有DP同步更新权重
broadcast_value = 0
if self.local_rank == 0 and signal_update_weight_from_tensor_array[0] == 1:
logger.info("redundant_expert: update_weight_from_tensor broadcast signal")
signal_update_weight_from_tensor_array[0] = 0
broadcast_value = REARRANGE_EXPERT_MAGIC_NUM
data = paddle.to_tensor([broadcast_value])
paddle.distributed.broadcast(data, 0)
if data[0] == REARRANGE_EXPERT_MAGIC_NUM:
self.update_weights_from_tensor(mmap_infos)
logger.info(
f"redundant_expert: update_weight_from_tensor success, cost {(time.time() - rearrange_time)*1000}ms"
)
paddle.distributed.barrier()
if self.local_rank == 0:
rearrange_experts_status_array[0] = RearrangeExpertState.done.value
logger.info("redundant_expert: done")
# run eplb
self._run_eplb(tp_rank)
if tp_rank == 0:
if self.model_weights_status.value[0] != ModelWeightsStatus.NORMAL:
self.model_weights_signal[0] = int(self.model_weights_status.value[0])
@@ -842,6 +863,13 @@ def parse_args():
help="FQCNs (Fully Qualified Class Names) of logits processors supported by the service.",
)
parser.add_argument(
"--eplb_config",
type=json.loads,
default=None,
help="EPLB Configuration.",
)
args = parser.parse_args()
return args
@@ -897,7 +925,7 @@ def initialize_fd_config(args, ranks: int = 1, local_rank: int = 0) -> FDConfig:
plas_attention_config = PlasAttentionConfig(args.plas_attention_config)
early_stop_config = EarlyStopConfig(args.early_stop_config)
eplb_config = EPLBConfig()
eplb_config = EPLBConfig(args.eplb_config)
structured_outputs_config: StructuredOutputsConfig = StructuredOutputsConfig(args=vars(args))