[Feature] support eplb in api_server (#4782)

* support eplb in api_server

* update code

* add eplb test case

* update eplb

* support tp+dp eplb

* update test case

* update code

* update code

* fix bug

* update copilot review

* update test case name
This commit is contained in:
kevin
2025-11-24 20:22:29 +08:00
committed by GitHub
parent d5bd64336a
commit 8e4e3ff510
25 changed files with 2102 additions and 421 deletions

View File

@@ -186,7 +186,6 @@ class ModelConfig:
self.enable_logprob = False
self.max_logprobs = 20
self.logprobs_mode = "raw_logprobs"
self.enable_redundant_experts = False
self.redundant_experts_num = 0
self.seed = 0
self.quantization = None
@@ -1153,20 +1152,54 @@ class EPLBConfig:
def __init__(
    self,
    args,
):
    """
    Initialize EPLB (Expert Parallelism Load Balancer) configuration.

    Args:
        args: mapping of option names to values; only keys that match an
            existing attribute are applied, unknown keys are ignored.
            ``None`` is treated as an empty mapping.
    """
    # NOTE(review): the env-driven assignments below duplicate attributes
    # that are re-assigned with literal defaults further down — confirm
    # which set is intended to survive (looks like a superseded code path).
    self.enable_redundant_experts = envs.FD_ENABLE_REDUNDANT_EXPERTS
    self.redundant_experts_num = envs.FD_REDUNDANT_EXPERTS_NUM
    self.redundant_expert_ip_shm_size = envs.FD_REDUNDANT_EXPERT_IP_SHM_SIZE
    self.redundant_expert_meta_dir = envs.FD_REDUNDANT_EXPERT_META_DIR
    self.redundant_expert_api_user = envs.FD_REDUNDANT_EXPERT_API_USER
    self.redundant_expert_api_password = envs.FD_REDUNDANT_EXPERT_API_PASSWORD
    self.redundant_expert_eplb_strategy = envs.FD_REDUNDANT_EXPERT_EPLB_STRATEGY
    self.redundant_expert_dump_workload_interval = envs.FD_REDUNDANT_EXPERT_DUMP_WORKLOAD_INTERVAL
    self.redundant_expert_async_load_model_shmem_size_gb = envs.FD_REDUNDANT_EXPERT_ASYNC_LOAD_MODEL_SHMEM_SIZE_GB
    self.redundant_expert_enable_schedule_cordon = envs.FD_REDUNDANT_EXPERT_ENABLE_SCHEDULE_CORDON
    self.model_use_safetensors = envs.FD_MODEL_USE_SAFETENSORS
    self.model_use_offline_quant = envs.FD_MODEL_USE_OFFLINE_QUANT
    self.moe_quant_type = envs.FD_MOE_QUANT_TYPE
    if args is None:
        args = {}
    # enable eplb
    self.enable_eplb: bool = False
    # redundant experts num
    self.redundant_experts_num: int = 0
    # expert ip shm size (bytes available for the ';'-joined instance IP list)
    self.redundant_expert_ip_shm_size: int = 1024
    # expert meta dir
    self.redundant_expert_meta_dir: str = "/tmp/redundant_expert_meta"
    # expert api user and password (credentials for the EPLB admin endpoints)
    self.redundant_expert_api_user: str = ""
    self.redundant_expert_api_password: str = ""
    # expert eplb strategy
    self.redundant_expert_eplb_strategy: str = ""
    # expert dump workload interval
    self.redundant_expert_dump_workload_interval: int = 10
    # expert async load model shmem size gb
    self.redundant_expert_async_load_model_shmem_size_gb: int = 0
    # expert enable schedule cordon
    self.redundant_expert_enable_schedule_cordon: bool = True
    # model use safetensors
    self.model_use_safetensors: bool = True
    # model use offline quant
    self.model_use_offline_quant: bool = True
    # moe quant type
    self.moe_quant_type: str = "w4a8"
    # Overlay user-supplied values; keys not declared above are ignored.
    for key, value in args.items():
        if hasattr(self, key):
            setattr(self, key, value)
def to_json_string(self):
    """
    Convert eplb_config to json string.

    Returns:
        str: JSON object of every attribute whose value is not None.
    """
    non_null = {name: val for name, val in self.__dict__.items() if val is not None}
    return json.dumps(non_null)
def print(self):
    """
    Print all configuration information.

    Logs every attribute of this config (one per line, name left-aligned
    in a 20-char column) followed by a separator, via the module ``logger``.
    """
    logger.info("EPLB Configuration Information :")
    for k, v in self.__dict__.items():
        logger.info("{:<20}:{:<6}{}".format(k, "", v))
    logger.info("=============================================================")
class CacheConfig:

View File

@@ -467,6 +467,16 @@ class EngineArgs:
Url for router server, such as `0.0.0.0:30000`.
"""
enable_eplb: bool = False
"""
Flag to enable eplb
"""
eplb_config: Optional[Dict[str, Any]] = None
"""
Configuration for eplb.
"""
def __post_init__(self):
"""
Post-initialization processing to set default tokenizer if not provided.
@@ -850,6 +860,18 @@ class EngineArgs:
default=EngineArgs.enable_expert_parallel,
help="Enable expert parallelism.",
)
parallel_group.add_argument(
"--enable-eplb",
action="store_true",
default=EngineArgs.enable_eplb,
help="Enable eplb.",
)
parallel_group.add_argument(
"--eplb-config",
type=json.loads,
default=EngineArgs.eplb_config,
help="Config of eplb.",
)
# Load group
load_group = parser.add_argument_group("Load Configuration")
@@ -1126,7 +1148,7 @@ class EngineArgs:
def create_scheduler_config(self) -> SchedulerConfig:
"""
Create and retuan a SchedulerConfig object based on the current settings.
Create and return a SchedulerConfig object based on the current settings.
"""
prefix = "scheduler_"
prefix_len = len(prefix)
@@ -1173,13 +1195,22 @@ class EngineArgs:
early_stop_args[k] = v
return EarlyStopConfig(early_stop_args)
def create_eplb_config(self) -> EPLBConfig:
    """
    Create and return an EPLBConfig object based on the current settings.

    Starts from the full EngineArgs dict, overlays any user-provided
    ``eplb_config`` entries, then forces ``enable_eplb`` from the CLI flag.
    """
    eplb_args = asdict(self)
    if self.eplb_config is not None:
        for k, v in self.eplb_config.items():
            eplb_args[k] = v
    # The --enable-eplb flag always wins over a value inside --eplb-config.
    eplb_args["enable_eplb"] = self.enable_eplb
    return EPLBConfig(eplb_args)
def create_engine_config(self, port_availability_check=True) -> FDConfig:
"""
Create and return a Config object based on the current settings.
"""
all_dict = asdict(self)
eplb_cfg = EPLBConfig()
all_dict["enable_redundant_experts"] = eplb_cfg.enable_redundant_experts
model_cfg = ModelConfig(all_dict)
# XPU currently disable prefix cache for VL model
@@ -1221,6 +1252,7 @@ class EngineArgs:
scheduler_cfg = self.create_scheduler_config()
graph_opt_cfg = self.create_graph_optimization_config()
plas_attention_config = self.create_plas_attention_config()
eplb_cfg = self.create_eplb_config()
router_config = RouterConfig(all_dict)
early_stop_cfg = self.create_early_stop_config()

View File

@@ -833,6 +833,7 @@ class AsyncLLMEngine:
f" --override-pooler-config {self.cfg.model_config.override_pooler_config}"
f" --logprobs_mode {self.cfg.model_config.logprobs_mode}"
f" --max_logprobs {self.cfg.model_config.max_logprobs}"
f" --eplb_config '{self.cfg.eplb_config.to_json_string()}'"
)
worker_store_true_flag = {

View File

@@ -34,6 +34,7 @@ from opentelemetry import trace
from fastdeploy.engine.request import Request, RequestOutput, RequestType
from fastdeploy.engine.resource_manager import ResourceManager
from fastdeploy.engine.sched.resource_manager_v1 import ResourceManagerV1
from fastdeploy.eplb.utils import init_eplb_signals
from fastdeploy.input.preprocess import InputPreprocessor
from fastdeploy.inter_communicator import (
EngineCacheQueue,
@@ -142,6 +143,12 @@ class EngineService:
)
self._init_worker_monitor_signals()
if self.cfg.eplb_config.enable_eplb:
current_suffix = int(
self.cfg.parallel_config.engine_worker_queue_port[self.cfg.parallel_config.local_data_parallel_id]
)
init_eplb_signals(cfg, current_suffix)
self._finalizer = weakref.finalize(self, self._exit_sub_services)
def start(self):

View File

@@ -566,6 +566,7 @@ class LLMEngine:
f" --override-pooler-config {self.cfg.model_config.override_pooler_config}"
f" --logprobs_mode {self.cfg.model_config.logprobs_mode}"
f" --max_logprobs {self.cfg.model_config.max_logprobs}"
f" --eplb_config '{self.cfg.eplb_config.to_json_string()}'"
)
if self.cfg.structured_outputs_config.logits_processors is not None:
arguments += f" --logits-processors {' '.join(self.cfg.structured_outputs_config.logits_processors)}"

View File

@@ -20,20 +20,22 @@ import time
import traceback
import uuid
from copy import copy
from http import HTTPStatus
import numpy as np
from filelock import FileLock
from fastdeploy import envs
from fastdeploy.config import ModelConfig
from fastdeploy.entrypoints.openai.utils import DealerConnectionManager
from fastdeploy.envs import FD_SUPPORT_MAX_CONNECTIONS
from fastdeploy.eplb.utils import RedundantExpertWorkload
from fastdeploy.input.preprocess import InputPreprocessor
from fastdeploy.inter_communicator import (
IPCSignal,
KVCacheStatus,
ModelWeightsStatus,
PrefixTreeStatus,
RearrangeExpertStatus,
ZmqIpcClient,
)
from fastdeploy.metrics.work_metrics import work_process_metrics
@@ -63,6 +65,7 @@ class EngineClient:
port,
limit_mm_per_prompt,
mm_processor_kwargs,
config,
reasoning_parser=None,
data_parallel_size=1,
enable_logprob=False,
@@ -72,11 +75,12 @@ class EngineClient:
splitwise_role=None,
max_processor_cache=0,
):
model_config = ModelConfig({"model": model_name_or_path})
self.enable_mm = model_config.enable_mm
self.config = config
self.model_config = config.model_config
self.enable_mm = self.model_config.enable_mm
enable_processor_cache = self.enable_mm and max_processor_cache > 0
input_processor = InputPreprocessor(
model_config,
self.model_config,
reasoning_parser,
limit_mm_per_prompt,
mm_processor_kwargs,
@@ -96,13 +100,16 @@ class EngineClient:
is_mm_model_disable_prefix_cache,
)
self.disable_prefix_mm = is_mm_model_disable_prefix_cache(model_config)
self.disable_prefix_mm = is_mm_model_disable_prefix_cache(self.model_config)
if tensor_parallel_size <= max_chips_per_node:
self.is_master = True
else:
self.is_master = False
if self.config.eplb_config.enable_eplb:
self.init_eplb_signals(ipc_signal_suffix=port)
array_size = min(max_chips_per_node, tensor_parallel_size)
self.worker_healthy_live_recorded_time_array = np.zeros(shape=[array_size], dtype=np.int32)
self.worker_healthy_live_signal = IPCSignal(
@@ -143,6 +150,113 @@ class EngineClient:
self.connection_initialized = False
self.clear_update_lock = FileLock(f"/tmp/fd_weight_clear_update_lock__pid{pid}_port{port}.lock")
def init_eplb_signals(self, ipc_signal_suffix):
    """
    Initialize eplb signals.

    Attaches (``create=False``) to the EPLB IPC shared-memory signals,
    namespaced first by data-parallel id and, inside the loop, per
    tensor-parallel rank.

    Args:
        ipc_signal_suffix: base suffix (the serving port) used to namespace
            all IPC signal names.
    """
    if self.config.parallel_config.tensor_parallel_rank != 0:
        # only TP rank 0 need to init eplb signals, rank 0 manage all EPLB signals for all TP ranks
        return
    # Per-TP-rank signal handles, filled by the loop below.
    self.signal_clear_experts_token_stats_list = []
    self.local_experts_token_stats_array_list = []
    self.expert_tokens_stats_array_list = []
    self.signal_update_weight_from_disk_array_list = []
    self.update_weight_from_disk_result_list = []
    # DP-scoped suffix: one signal set per local data-parallel replica.
    dp_ipc_signal_suffix = f"{ipc_signal_suffix}_dp{self.config.parallel_config.local_data_parallel_id}"
    rearrange_experts_status = np.zeros([1], dtype=np.int32)
    self.rearrange_experts_signal = IPCSignal(
        name="rearrange_experts_status",
        array=rearrange_experts_status,
        dtype=np.int32,
        suffix=dp_ipc_signal_suffix,
        create=False,
    )
    # Byte length of the IP list currently stored in the shm buffer below.
    rearrange_experts_ips_size_array = np.zeros([1], dtype=np.int32)
    self.rearrange_experts_ips_size_signal = IPCSignal(
        name="rearrange_experts_ips_size",
        array=rearrange_experts_ips_size_array,
        dtype=np.int32,
        suffix=dp_ipc_signal_suffix,
        create=False,
    )
    # Raw shared-memory buffer holding the ';'-joined instance IP list.
    self.shm_rearrange_experts_ips_list = IPCSignal(
        name="rearrange_experts_ips_list",
        shm_size=self.config.eplb_config.redundant_expert_ip_shm_size,
        suffix=dp_ipc_signal_suffix,
        create=False,
    )
    signal_update_weight_from_tensor = np.zeros([1], dtype=np.int32)
    self.signal_update_weight_from_tensor_array = IPCSignal(
        name="signal_update_weight_from_tensor",
        array=signal_update_weight_from_tensor,
        dtype=np.int32,
        suffix=dp_ipc_signal_suffix,
        create=False,
    )
    # Attach the per-rank signals for every TP rank in this DP replica.
    for tp_rank_id in range(self.config.parallel_config.tensor_parallel_size):
        tp_ipc_signal_suffix = f"{dp_ipc_signal_suffix}_tp{tp_rank_id}"
        signal_clear_experts_token_stats = np.zeros([1], dtype=np.int32)
        self.signal_clear_experts_token_stats_list.append(
            IPCSignal(
                name="signal_clear_experts_token_stats",
                array=signal_clear_experts_token_stats,
                dtype=np.int32,
                suffix=tp_ipc_signal_suffix,
                create=False,
            )
        )
        signal_update_weight_from_disk = np.zeros([1], dtype=np.int32)
        self.signal_update_weight_from_disk_array_list.append(
            IPCSignal(
                name="signal_update_weight_from_disk",
                array=signal_update_weight_from_disk,
                dtype=np.int32,
                suffix=tp_ipc_signal_suffix,
                create=False,
            )
        )
        result_update_weight_from_disk = np.zeros([1], dtype=np.int32)
        self.update_weight_from_disk_result_list.append(
            IPCSignal(
                name="result_update_weight_from_disk",
                array=result_update_weight_from_disk,
                dtype=np.int32,
                suffix=tp_ipc_signal_suffix,
                create=False,
            )
        )
        # (layers, experts) token-count template.
        # NOTE(review): the same template array object is passed to both
        # IPCSignal attachments below — confirm IPCSignal copies/detaches it.
        experts_token_stats = np.zeros(
            (self.config.model_config.num_hidden_layers, self.config.model_config.moe_num_experts),
            dtype=np.int32,
        )
        self.expert_tokens_stats_array_list.append(
            IPCSignal(
                name="all_experts_token_stats",
                array=experts_token_stats,
                dtype=np.int32,
                suffix=tp_ipc_signal_suffix,
                create=False,
            )
        )
        self.local_experts_token_stats_array_list.append(
            IPCSignal(
                name="local_experts_token_stats",
                array=experts_token_stats,
                dtype=np.int32,
                suffix=tp_ipc_signal_suffix,
                create=False,
            )
        )
def create_zmq_client(self, model, mode):
"""
Create a ZMQ client.
@@ -470,3 +584,199 @@ class EngineClient:
def check_model_weight_status(self):
return self.model_weights_status_signal.value[0] < 0
async def rearrange_experts(self, request_dict: dict):
    """
    rearrange experts
    Args:
        request_dict (dict): request body
    Returns:
        tuple: response body, status code
    """
    # Start with "no error recorded"; failure checks below only fill
    # `content` on their error path, mirroring check_redundant.
    # BUGFIX: without this initialization, the `content is None` guards in
    # the action branches raised UnboundLocalError whenever the preceding
    # checks all passed.
    content, status_code = None, HTTPStatus.OK
    eplb_config = self.config.eplb_config
    if not eplb_config.enable_eplb:
        content = {"code": 1, "msg": "redundant expert is disabled"}
        status_code = HTTPStatus.BAD_REQUEST
        return content, status_code
    if (
        request_dict.get("user", "") != eplb_config.redundant_expert_api_user
        or request_dict.get("passwd", "") != eplb_config.redundant_expert_api_password
    ):
        content = {"code": 1, "msg": "user or passwd is invalid"}
        status_code = HTTPStatus.UNAUTHORIZED
        return content, status_code
    # Only TP rank 0 owns the EPLB signals (see init_eplb_signals).
    if self.config.parallel_config.tensor_parallel_rank != 0:
        content = {
            "code": 1,
            "msg": f"actual rank {self.config.parallel_config.tensor_parallel_rank}, expect rank 0",
        }
        status_code = HTTPStatus.BAD_REQUEST
        return content, status_code
    action = request_dict.get("action", "")
    api_server_logger.info(f"redundant_expert: rearrange_experts recv request, action {action}")
    if action == "":
        # action: start rearrange experts
        # params: {'user': 'xxx', 'passwd': 'xxx', 'ips': ['10.54.99.77:8000', '10.54.99.77:8300']}
        if self.rearrange_experts_signal.value[0] != RearrangeExpertStatus.FREE.value:
            content = {
                "code": 1,
                "msg": f"rearrange is doing. actual status {self.rearrange_experts_signal.value[0]}, expect status {RearrangeExpertStatus.FREE.value}",
            }
            status_code = HTTPStatus.BAD_REQUEST
        if "ips" not in request_dict and content is None:
            content = {"code": 1, "msg": "ips in request is None"}
            status_code = HTTPStatus.BAD_REQUEST
        if content is not None:
            return content, status_code
        # Publish the ';'-joined IP list through the shared-memory buffer.
        data_bytes = (";".join(request_dict["ips"])).encode("utf-8")
        data_size = len(data_bytes)
        if data_size > eplb_config.redundant_expert_ip_shm_size:
            content = {
                "code": 1,
                "msg": f"actual ips size {data_size}, max limit {eplb_config.redundant_expert_ip_shm_size}",
            }
            status_code = HTTPStatus.INTERNAL_SERVER_ERROR
        else:
            self.rearrange_experts_ips_size_signal.value[0] = data_size
            self.shm_rearrange_experts_ips_list.shm.buf[:data_size] = data_bytes
            content = {"code": 0, "msg": "ok"}
            status_code = HTTPStatus.OK
        return content, status_code
    elif action == "recv_expert_weight":
        # action: receive global expert workload, and begin update weight from disk
        # params: {'user': 'xxx', 'passwd': 'xxx', 'weight': (layers, experts)}
        if "data" not in request_dict or not isinstance(request_dict["data"], list):
            content = {"code": 1, "msg": "data not in request or data is not a list"}
            status_code = HTTPStatus.BAD_REQUEST
        else:
            # Broadcast the workload to every TP rank and trigger its reload.
            weight = np.array(request_dict["data"], dtype=np.int32)
            for idx in range(len(self.expert_tokens_stats_array_list)):
                self.expert_tokens_stats_array_list[idx].value[:] = weight[:]
                self.signal_update_weight_from_disk_array_list[idx].value[0] = 1
            content = {"code": 0, "msg": "ok"}
            status_code = HTTPStatus.OK
        return content, status_code
    elif action == "update_weight_from_tensor":
        if self.config.scheduler_config.splitwise_role != "prefill" and content is None:
            content = {
                "code": 1,
                "msg": f"actual role {self.config.scheduler_config.splitwise_role}, expect role prefill",
            }
            status_code = HTTPStatus.BAD_REQUEST
        if self.rearrange_experts_signal.value[0] != RearrangeExpertStatus.LOAD_SUCC.value and content is None:
            content = {
                "code": 1,
                "msg": f"actual status {self.rearrange_experts_signal.value[0]}, expect status {RearrangeExpertStatus.LOAD_SUCC.value}",
            }
            status_code = HTTPStatus.BAD_REQUEST
        if content is None:
            self.signal_update_weight_from_tensor_array.value[0] = 1
            content = {"code": 0, "msg": "ok"}
            status_code = HTTPStatus.OK
        return content, status_code
    else:
        content = {"code": 1, "msg": f"invalid action {action}"}
        status_code = HTTPStatus.BAD_REQUEST
    return content, status_code
async def get_per_expert_tokens_stats(self, request_dict: dict):
    """
    get per expert tokens stats
    Args:
        request_dict (dict): request body
    Returns:
        tuple: response body, status code
    """
    eplb_config = self.config.eplb_config
    if not eplb_config.enable_eplb:
        return {"code": 1, "msg": "redundant expert is disabled"}, HTTPStatus.BAD_REQUEST
    user_ok = request_dict.get("user", "") == eplb_config.redundant_expert_api_user
    passwd_ok = request_dict.get("passwd", "") == eplb_config.redundant_expert_api_password
    if not (user_ok and passwd_ok):
        return {"code": 1, "msg": "user or passwd is invalid"}, HTTPStatus.UNAUTHORIZED
    tp_rank = self.config.parallel_config.tensor_parallel_rank
    if tp_rank != 0:
        # Only TP rank 0 holds the EPLB signal handles.
        return {"code": 1, "msg": f"actual rank {tp_rank}, expect rank 0"}, HTTPStatus.BAD_REQUEST
    if request_dict.get("clear_stat"):
        # Ask every TP rank's worker to reset its per-expert token counters.
        for clear_signal in self.signal_clear_experts_token_stats_list:
            clear_signal.value[0] = 1
    stats = [entry.value.tolist() for entry in self.local_experts_token_stats_array_list]
    return {"code": 0, "msg": "ok", "data": stats}, HTTPStatus.OK
async def check_redundant(self, request_dict: dict):
    """
    check redundant
    Args:
        request_dict (dict): request body
    Returns:
        tuple: response body, status code
    """
    # No result recorded yet; an unrecognized action falls through to
    # (None, 200 OK), matching the original behavior.
    content, status_code = None, HTTPStatus.OK
    eplb_config = self.config.eplb_config
    if not eplb_config.enable_eplb:
        return {"code": 1, "msg": "redundant expert is disabled"}, HTTPStatus.BAD_REQUEST
    valid_user = request_dict.get("user", "") == eplb_config.redundant_expert_api_user
    valid_passwd = request_dict.get("passwd", "") == eplb_config.redundant_expert_api_password
    if not (valid_user and valid_passwd):
        return {"code": 1, "msg": "user or passwd is invalid"}, HTTPStatus.UNAUTHORIZED
    tp_rank = self.config.parallel_config.tensor_parallel_rank
    if tp_rank != 0:
        # Only TP rank 0 holds the EPLB signal handles.
        return {"code": 1, "msg": f"actual rank {tp_rank}, expect rank 0"}, HTTPStatus.BAD_REQUEST
    action = request_dict.get("action", "")
    if action == "":
        try:
            status = RearrangeExpertStatus(self.rearrange_experts_signal.value[0]).name
        except Exception:
            # Ignore errors if status cannot be determined; default to "unknown"
            status = "unknown"
        content = {"code": 0, "msg": "ok", "status": status}
        if request_dict.get("check_get_workloads", False):
            content["data"], content["msg"] = RedundantExpertWorkload(eplb_config.redundant_expert_meta_dir).load()
        status_code = HTTPStatus.OK
    elif action == "check_load_weight_result":
        results = [item.value[0].tolist() for item in self.update_weight_from_disk_result_list]
        content = {"code": 0, "msg": "ok", "data": results}
        status_code = HTTPStatus.OK
    return content, status_code

View File

@@ -179,6 +179,8 @@ async def lifespan(app: FastAPI):
verification = False
model_paths = [ModelPath(name=served_model_names, model_path=args.model, verification=verification)]
engine_args = EngineArgs.from_cli_args(args)
config = engine_args.create_engine_config(port_availability_check=False)
engine_client = EngineClient(
model_name_or_path=args.model,
tokenizer=args.tokenizer,
@@ -196,6 +198,7 @@ async def lifespan(app: FastAPI):
enable_prefix_caching=args.enable_prefix_caching,
splitwise_role=args.splitwise_role,
max_processor_cache=args.max_processor_cache,
config=config,
)
await engine_client.connection_manager.initialize()
app.state.dynamic_load_weight = args.dynamic_load_weight
@@ -223,8 +226,6 @@ async def lifespan(app: FastAPI):
args.max_waiting_time,
)
engine_args = EngineArgs.from_cli_args(args)
config = engine_args.create_engine_config(port_availability_check=False)
embedding_handler = OpenAIServingEmbedding(
engine_client,
app.state.model_handler,
@@ -515,6 +516,36 @@ def clear_load_weight(request: Request) -> Response:
return Response(content="Dynamic Load Weight Disabled.", status_code=404)
@app.post("/rearrange_experts")
async def rearrange_experts(request: Request):
"""
rearrange experts
"""
request_dict = await request.json()
content, status_code = await app.state.engine_client.rearrange_experts(request_dict=request_dict)
return JSONResponse(content, status_code=status_code)
@app.post("/get_per_expert_tokens_stats")
async def get_per_expert_tokens_stats(request: Request):
"""
get per expert tokens stats
"""
request_dict = await request.json()
content, status_code = await app.state.engine_client.get_per_expert_tokens_stats(request_dict=request_dict)
return JSONResponse(content, status_code=status_code)
@app.post("/check_redundant")
async def check_redundant(request: Request):
"""
check redundant
"""
request_dict = await request.json()
content, status_code = await app.state.engine_client.check_redundant(request_dict=request_dict)
return JSONResponse(content, status_code=status_code)
def launch_api_server() -> None:
"""
启动http服务

View File

@@ -351,6 +351,8 @@ def create_model_paths(args: Namespace) -> List[ModelPath]:
async def initialize_engine_client(args: Namespace, pid: int) -> EngineClient:
"""Initialize and configure the engine client."""
engine_args = EngineArgs.from_cli_args(args)
config = engine_args.create_engine_config(port_availability_check=False)
engine_client = EngineClient(
model_name_or_path=args.model,
tokenizer=args.tokenizer,
@@ -365,6 +367,7 @@ async def initialize_engine_client(args: Namespace, pid: int) -> EngineClient:
enable_logprob=args.enable_logprob,
workers=args.workers,
tool_parser=args.tool_call_parser,
config=config,
)
await engine_client.connection_manager.initialize()

View File

@@ -136,27 +136,6 @@ environment_variables: dict[str, Callable[[], Any]] = {
"FD_CACHE_PROC_ERROR_COUNT": lambda: int(os.getenv("FD_CACHE_PROC_ERROR_COUNT", "10")),
# API_KEY required for service authentication
"FD_API_KEY": lambda: [] if "FD_API_KEY" not in os.environ else os.environ["FD_API_KEY"].split(","),
# EPLB related
"FD_ENABLE_REDUNDANT_EXPERTS": lambda: int(os.getenv("FD_ENABLE_REDUNDANT_EXPERTS", "0")) == 1,
"FD_REDUNDANT_EXPERTS_NUM": lambda: int(os.getenv("FD_REDUNDANT_EXPERTS_NUM", "0")),
"FD_REDUNDANT_EXPERT_IP_SHM_SIZE": lambda: int(os.getenv("FD_REDUNDANT_EXPERT_IP_SHM_SIZE", "1024")),
"FD_REDUNDANT_EXPERT_META_DIR": lambda: os.getenv("FD_REDUNDANT_EXPERT_META_DIR", "/tmp/redundant_expert_meta"),
"FD_REDUNDANT_EXPERT_API_USER": lambda: os.getenv("FD_REDUNDANT_EXPERT_API_USER", ""),
"FD_REDUNDANT_EXPERT_API_PASSWORD": lambda: os.getenv("FD_REDUNDANT_EXPERT_API_PASSWORD", ""),
"FD_REDUNDANT_EXPERT_EPLB_STRATEGY": lambda: os.getenv("FD_REDUNDANT_EXPERT_EPLB_STRATEGY", ""),
"FD_REDUNDANT_EXPERT_DUMP_WORKLOAD_INTERVAL": lambda: int(
os.getenv("FD_REDUNDANT_EXPERT_DUMP_WORKLOAD_INTERVAL", "10")
),
"FD_REDUNDANT_EXPERT_ASYNC_LOAD_MODEL_SHMEM_SIZE_GB": lambda: int(
os.getenv("FD_REDUNDANT_EXPERT_ASYNC_LOAD_MODEL_SHMEM_SIZE_GB", "0")
),
"FD_REDUNDANT_EXPERT_ENABLE_SCHEDULE_CORDON": lambda: int(
os.getenv("FD_REDUNDANT_EXPERT_ENABLE_SCHEDULE_CORDON", "1")
)
== 1,
"FD_MODEL_USE_SAFETENSORS": lambda: int(os.getenv("FD_MODEL_USE_SAFETENSORS", "1")) == 1,
"FD_MODEL_USE_OFFLINE_QUANT": lambda: int(os.getenv("FD_MODEL_USE_OFFLINE_QUANT", "1")) == 1,
"FD_MOE_QUANT_TYPE": lambda: os.getenv("FD_MOE_QUANT_TYPE", "w4a8"),
# The AK of bos storing the features while multi_modal infer
"ENCODE_FEATURE_BOS_AK": lambda: os.getenv("ENCODE_FEATURE_BOS_AK"),
# The SK of bos storing the features while multi_modal infer

View File

@@ -1,3 +1,15 @@
""" "
Expert Parallelism Load Balancer (EPLB)
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""

View File

@@ -1,4 +1,18 @@
"""AsyncExpertLoader async load the model weights of the MoE experts."""
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
import ctypes
import os
@@ -8,8 +22,9 @@ from typing import List, Tuple
import numpy as np
import paddle
from cuda import cudart
from fastdeploy import envs
from fastdeploy.config import EPLBConfig
REARRANGE_EXPERT_MAGIC_NUM = 147183647
REARRANGE_ORIGINATOR_EP_RANK = 0
@@ -17,7 +32,6 @@ CHECK_TIME_INTERNAL = 3
HTTP_RETRY_NUM = 5
CHECK_TIMEOUT = 120
libc = ctypes.CDLL(None)
libc.mmap.argtypes = [
@@ -45,22 +59,19 @@ MAIN_MODEL_REDUNDANT_SHM_SIZE = 5
MODEL_MAIN_NAME = "eplb_main"
def create_mmap(model_name: List, ep_rank: int, ep_size: int, shm_uuid: str, logger=None):
def create_mmap(model_name: List, ep_rank: int, ep_size: int, shm_uuid: str, eplb_config: EPLBConfig, logger=None):
"""create_mmap"""
flags = MAP_SHARED
prot = PROT_READ | PROT_WRITE
main_size = 0
if envs.FD_REDUNDANT_EXPERT_ASYNC_LOAD_MODEL_SHMEM_SIZE_GB == 0:
if eplb_config.redundant_expert_async_load_model_shmem_size_gb == 0:
main_size = TOTAL_MODEL_SIZE // ep_size
else:
main_size = envs.FD_REDUNDANT_EXPERT_ASYNC_LOAD_MODEL_SHMEM_SIZE_GB
main_size = eplb_config.redundant_expert_async_load_model_shmem_size_gb
main_size = main_size * G
mmap_infos = {}
from cuda import cudart
for name in model_name:
expert_weight_file = f"/dev/shm/{name}_rank_{ep_rank}_expert_weight_{shm_uuid}"
shm_size = main_size
@@ -70,10 +81,7 @@ def create_mmap(model_name: List, ep_rank: int, ep_size: int, shm_uuid: str, log
shm_fd = os.open(expert_weight_file, os.O_RDWR)
os.ftruncate(shm_fd, shm_size)
if logger is not None:
logger.info(
f"redundant_expert: create_mmap file {expert_weight_file}, \
fd {shm_fd}, size {shm_size}"
)
logger.info(f"redundant_expert: create_mmap file {expert_weight_file}, fd {shm_fd}, size {shm_size}")
shm_ptr = libc.mmap(0, ctypes.c_size_t(shm_size), prot, flags, shm_fd, 0)
if shm_ptr == MAP_FAILED:
@@ -86,8 +94,8 @@ def create_mmap(model_name: List, ep_rank: int, ep_size: int, shm_uuid: str, log
(ret,) = cudart.cudaHostRegister(addr, shm_size, 0)
if ret != cudart.cudaError_t.cudaSuccess:
raise RuntimeError(
f"cudaHostRegister failed: {cudart.cudaGetErrorString(ret)},"
+ f" address {hex(addr)} size {shm_size}, ret: {ret}"
f"cudaHostRegister failed: {cudart.cudaGetErrorString(ret)}, "
f" address {hex(addr)} size {shm_size}, ret: {ret}"
)
mmap_infos[name] = shm_ptr
@@ -173,6 +181,7 @@ class AsyncEPLoader(object):
def __init__(
self,
model_dir,
eplb_config,
rank=8,
expert_per_rank=8,
moe_layer_start_index=3,
@@ -183,6 +192,7 @@ class AsyncEPLoader(object):
__init__
"""
self.model_path = model_dir
self.eplb_config = eplb_config
self.expert_per_rank = expert_per_rank
self.moe_layer_start_index = moe_layer_start_index
@@ -239,7 +249,7 @@ class AsyncEPLoader(object):
succ = True
message = ""
if len(need_to_reload) > 0:
if envs.FD_MODEL_USE_SAFETENSORS:
if self.eplb_config.model_use_safetensors:
succ, message = self.load_safetensor_fp8_from_disk(need_to_reload)
else:
succ, message = self.load_weight_bf16_from_disk(need_to_reload)
@@ -278,7 +288,7 @@ class AsyncEPLoader(object):
# self.logger.info(f"redundant_expert: {file_name} not exist.")
continue
# self.logger.info(f"redundant_expert: Loading expert weights: {file_name}.")
self.state_dicts[file_name] = paddle.load(self.model_path + "/merged_tp1_state_split/" + file_name)
# self.state_dicts[file_name] = paddle.load(self.model_path + "/merged_tp1_state_split/" + file_name)
paddle.set_device(last_device)
self.logger.info("redundant_expert: Loading expert weights end.")
@@ -343,7 +353,15 @@ def load_ep_checkpoint(model_path):
def load_model_weights_process(
rank: int, expert_per_rank: int, moe_layer_start_index: int, moe_quant_type: str, data_conn, mg_conn, shm_uuid
rank: int,
model_dir: str,
expert_per_rank: int,
moe_layer_start_index: int,
moe_quant_type: str,
shm_uuid: str,
eplb_config: EPLBConfig,
data_conn,
mg_conn,
):
"""
load_model_weights_process
@@ -354,18 +372,20 @@ def load_model_weights_process(
setproctitle(f"eplb::async_load_model_{rank}")
faulthandler.enable()
from server.utils import get_logger
from fastdeploy.utils import get_logger
logger = get_logger("eplb_async_loader", "eplb_{0}.log".format(rank))
logger.info("redundant_expert: load_model_weights_process start")
paddle.set_device("cpu")
ep_loader = AsyncEPLoader(
model_dir=model_dir,
rank=rank,
expert_per_rank=expert_per_rank,
moe_layer_start_index=moe_layer_start_index,
moe_quant_type=moe_quant_type,
logger=logger,
eplb_config=eplb_config,
)
while True:

View File

@@ -1,4 +1,18 @@
"""Expert Parallelism Load Balancer (EPLB)"""
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
from typing import Tuple
@@ -9,11 +23,9 @@ def balanced_packing(weight: np.ndarray, num_packs: int) -> Tuple[np.ndarray, np
"""
Pack n weighted objects to m packs, such that each bin contains exactly n/m objects and the weights of all packs
are as balanced as possible.
Parameters:
weight: [X, n], the weight of each item
num_packs: number of packs
Returns:
pack_index: [X, n], the pack index of each item
rank_in_pack: [X, n], the rank of the item in the pack
@@ -49,11 +61,9 @@ def balanced_packing(weight: np.ndarray, num_packs: int) -> Tuple[np.ndarray, np
def replicate_experts(weight: np.ndarray, num_phy: int) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
"""
Replicate `num_log` experts to `num_phy` replicas, such that the maximum load of all replicas is minimized.
Parameters:
weight: [X, num_log]
num_phy: total number of experts after replication
Returns:
phy2log: [X, num_phy], logical expert id of each physical expert
rank: [X, num_phy], the replica rank
@@ -88,7 +98,6 @@ def rebalance_experts_intra_node(
num_groups: number of expert groups
num_nodes: number of server nodes, where the intra-node network (e.g, NVLink) is faster
num_gpus: number of GPUs, must be a multiple of `num_nodes`
Returns:
physical_to_logical_map: [num_moe_layers, num_physical_experts]
logical_to_physical_map: [num_moe_layers, num_logical_experts, X]
@@ -155,7 +164,6 @@ def rebalance_experts_hierarchical(
num_groups: number of expert groups
num_nodes: number of server nodes, where the intra-node network (e.g, NVLink) is faster
num_gpus: number of GPUs, must be a multiple of `num_nodes`
Returns:
physical_to_logical_map: [num_moe_layers, num_physical_experts]
logical_to_physical_map: [num_moe_layers, num_logical_experts, X]
@@ -215,14 +223,12 @@ def rebalance_experts(
) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
"""
Entry point for expert-parallelism load balancer.
Parameters:
weight: [layers, num_logical_experts], the load statistics for all logical experts
num_replicas: number of physical experts, must be a multiple of `num_gpus`
num_groups: number of expert groups
num_nodes: number of server nodes, where the intra-node network (e.g, NVLink) is faster
num_gpus: number of GPUs, must be a multiple of `num_nodes`
Returns:
physical_to_logical_map: [layers, num_replicas], the expert index of each replica
logical_to_physical_map: [layers, num_logical_experts, X], the replica indices for each expert
@@ -267,9 +273,6 @@ def main():
num_nodes = 4
num_gpus = 4 * 8
# model_tokens_per_expert_stats_list = np.ones(
# (num_hidden_layers, num_expert), dtype=int)
model_tokens_per_expert_stats_list = np.random.randint(low=1, high=10, size=(num_hidden_layers, num_expert))
phy2log, phyrank, logcnt = rebalance_experts(

View File

@@ -1,19 +1,33 @@
"""
redundant expert manger
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
import threading
import time
from http import HTTPStatus
from multiprocessing import Pipe, Process, shared_memory
from multiprocessing import Pipe, Process
import numpy as np
import requests
from fastdeploy.config import FDConfig
from fastdeploy.eplb.async_expert_loader import load_model_weights_process
from fastdeploy.eplb.eplb import rebalance_experts
from fastdeploy.eplb.utils import RearrangeExpertState, RedundantExpertWorkload
from fastdeploy.utils import envs, get_logger
from fastdeploy.eplb.utils import RedundantExpertWorkload
from fastdeploy.inter_communicator import IPCSignal, RearrangeExpertStatus
from fastdeploy.utils import get_logger
class RedundantExpertManager:
@@ -21,7 +35,13 @@ class RedundantExpertManager:
RedundantExpertManger
"""
def __init__(self, rank=0, ep_size=64, fd_config=None):
def __init__(
self,
rank: int = 0,
ep_size: int = 32,
fd_config: FDConfig = None,
ipc_signal_suffix: int = 0,
):
self.logger = get_logger("eplb_expert_manager", "eplb_{0}.log".format(rank))
self.rank = rank
@@ -30,9 +50,11 @@ class RedundantExpertManager:
self.eplb_config = fd_config.eplb_config
self.api_user = self.eplb_config.redundant_expert_api_user
self.api_passwd = self.eplb_config.redundant_expert_api_password
self.num_hidden_layers = self.eplb_config.model_config.num_layers
self.num_logical_experts = self.eplb_config.model_config.moe_num_experts
self.num_redundant_experts = self.eplb_config.redundant_experts_num
self.num_hidden_layers = self.fd_config.model_config.num_hidden_layers
self.num_logical_experts = self.fd_config.model_config.moe_num_experts
self.ipc_signal_suffix = ipc_signal_suffix
self.local_rank = self.rank % self.fd_config.parallel_config.tensor_parallel_size
self.num_replicas = self.num_logical_experts + self.num_redundant_experts
self.num_groups = self.num_logical_experts
@@ -112,9 +134,12 @@ class RedundantExpertManager:
name=f"eplb::async_load_model_{rank}",
args=(
self.rank,
self.fd_config.model_config.model,
self.expert_per_rank,
self.fd_config.model_config.moe_layer_start_index,
self.eplb_config.moe_quant_type,
self.ipc_signal_suffix,
self.eplb_config,
child_data_conn,
child_mg_conn,
),
@@ -130,9 +155,6 @@ class RedundantExpertManager:
strategy {self.eplb_config.redundant_expert_eplb_strategy}"
)
def get_unique_name(self, name):
return f"{envs.get_unique_name(name + '_dprank_' + str(self.rank))}"
def get_ep_rank_to_expert_id_list(self):
"""
get_ep_rank_to_expert_id_list
@@ -147,66 +169,84 @@ class RedundantExpertManager:
"""
listen_rearrange_expert_signal
"""
if self.rank == 0:
rearrange_experts_ips_size = np.zeros([1], dtype=np.int32)
shm_rearrange_experts_ips_size = shared_memory.SharedMemory(
dp_ipc_signal_suffix = f"{self.ipc_signal_suffix}_dp{self.fd_config.parallel_config.local_data_parallel_id}"
if self.local_rank == 0:
rearrange_experts_ips_size_array = np.zeros([1], dtype=np.int32)
rearrange_experts_ips_size_signal = IPCSignal(
name="rearrange_experts_ips_size",
array=rearrange_experts_ips_size_array,
dtype=np.int32,
suffix=dp_ipc_signal_suffix,
create=False,
size=rearrange_experts_ips_size.nbytes,
name=self.get_unique_name("rearrange_experts_ips_size"),
)
rearrange_experts_ips_size_array = np.ndarray(
rearrange_experts_ips_size.shape,
dtype=rearrange_experts_ips_size.dtype,
buffer=shm_rearrange_experts_ips_size.buf,
)
shm_rearrange_experts_ips_list = shared_memory.SharedMemory(
shm_rearrange_experts_ips_list = IPCSignal(
name="rearrange_experts_ips_list",
shm_size=self.eplb_config.redundant_expert_ip_shm_size,
suffix=dp_ipc_signal_suffix,
create=False,
size=1024,
name=self.get_unique_name("rearrange_experts_ips_list"),
)
rearrange_experts_status = np.zeros([1], dtype=np.int32)
shm_rearrange_experts_status = shared_memory.SharedMemory(
rearrange_experts_signal = IPCSignal(
name="rearrange_experts_status",
array=rearrange_experts_status,
dtype=np.int32,
suffix=dp_ipc_signal_suffix,
create=False,
size=rearrange_experts_status.nbytes,
name=self.get_unique_name("rearrange_experts_status"),
)
rearrange_experts_status_array = np.ndarray(
rearrange_experts_status.shape,
dtype=rearrange_experts_status.dtype,
buffer=shm_rearrange_experts_status.buf,
)
signal_update_weight_from_tensor = np.zeros([1], dtype=np.int32)
self.signal_update_weight_from_tensor_array = IPCSignal(
name="signal_update_weight_from_tensor",
array=signal_update_weight_from_tensor,
dtype=np.int32,
suffix=dp_ipc_signal_suffix,
create=False,
)
tp_ipc_signal_suffix = f"{dp_ipc_signal_suffix}_tp{self.local_rank}"
signal_update_weight_from_disk = np.zeros([1], dtype=np.int32)
shm_signal_update_weight_from_disk = shared_memory.SharedMemory(
signal_update_weight_from_disk_array = IPCSignal(
name="signal_update_weight_from_disk",
array=signal_update_weight_from_disk,
dtype=np.int32,
suffix=tp_ipc_signal_suffix,
create=False,
size=signal_update_weight_from_disk.nbytes,
name=self.get_unique_name("signal_update_weight_from_disk"),
)
signal_update_weight_from_disk_array = np.ndarray(
signal_update_weight_from_disk.shape,
dtype=signal_update_weight_from_disk.dtype,
buffer=shm_signal_update_weight_from_disk.buf,
)
experts_token_stats = np.zeros((self.num_hidden_layers, 64), dtype=np.int32)
shm_all_experts_token_stats = shared_memory.SharedMemory(
experts_token_stats = np.zeros(
(self.fd_config.model_config.num_hidden_layers, self.fd_config.model_config.moe_num_experts),
dtype=np.int32,
)
shm_all_experts_token_stats = IPCSignal(
name="all_experts_token_stats",
array=experts_token_stats,
dtype=np.int32,
suffix=tp_ipc_signal_suffix,
create=False,
)
result_update_weight_from_disk = np.zeros([1], dtype=np.int32)
self.update_weight_from_disk_result = IPCSignal(
name="result_update_weight_from_disk",
array=result_update_weight_from_disk,
dtype=np.int32,
suffix=tp_ipc_signal_suffix,
create=False,
size=experts_token_stats.nbytes,
name=self.get_unique_name("all_experts_token_stats"),
)
while True:
if self.rank == 0:
if self.local_rank == 0:
now = int(time.time())
if rearrange_experts_ips_size_array[0] > 0:
if rearrange_experts_ips_size_signal.value[0] > 0:
# step 1. all reduce experts token stats
address = bytes(shm_rearrange_experts_ips_list.buf[: rearrange_experts_ips_size_array[0]]).decode(
"utf-8"
)
address = bytes(
shm_rearrange_experts_ips_list.shm.buf[: rearrange_experts_ips_size_signal.value[0]]
).decode("utf-8")
self.logger.info(f"redundant_expert: all rank ips {address}")
rearrange_experts_ips_size_array[0] = 0
rearrange_experts_status_array[0] = RearrangeExpertState.doing.value
rearrange_experts_ips_size_signal.value[0] = 0
rearrange_experts_signal.value[0] = RearrangeExpertStatus.DOING.value
self.dp_rank_address = address.strip().split(";")
if self.allreduce_experts_stat():
@@ -214,30 +254,25 @@ class RedundantExpertManager:
self.load_weight_begin_ts = now
self.logger.info("redundant_expert: all-reduce experts stats success")
else:
rearrange_experts_status_array[0] = RearrangeExpertState.free.value
rearrange_experts_signal.value[0] = RearrangeExpertStatus.FREE.value
self.logger.warning("redundant_expert: all-reduce experts stats fail")
elif self.need_allgather_load_weight_result and self.allreduce_load_weight_result():
# step 3. all reduce the result of load weight from disk
self.need_allgather_load_weight_result = False
rearrange_experts_status_array[0] = RearrangeExpertState.load_succ.value
rearrange_experts_signal.value[0] = RearrangeExpertStatus.LOAD_SUCC.value
self.rearrange_end_ts = now
if rearrange_experts_status_array[0] > 1 and (
if rearrange_experts_signal.value[0] > 1 and (
now - self.rearrange_end_ts > self.rearrange_reset_interval
):
# reset rearrange status
rearrange_experts_status_array[0] = RearrangeExpertState.free.value
rearrange_experts_signal.value[0] = RearrangeExpertStatus.FREE.value
if signal_update_weight_from_disk_array[0] == 1:
if signal_update_weight_from_disk_array.value[0] == 1:
# step 2. async load weight: disk -> memory
expert_token_stats = np.ndarray(
experts_token_stats.shape,
dtype=experts_token_stats.dtype,
buffer=shm_all_experts_token_stats.buf,
)
self.model_tokens_per_expert_stats_list[:] = expert_token_stats[:]
self.model_tokens_per_expert_stats_list[:] = shm_all_experts_token_stats.value[:]
self.caculate_expert_rank_table()
self.update_weight_from_disk()
signal_update_weight_from_disk_array[0] = 0
signal_update_weight_from_disk_array.value[0] = 0
time.sleep(0.5)
def caculate_expert_rank_table(self, is_init=False):
@@ -274,7 +309,7 @@ class RedundantExpertManager:
self.model_expert_id_to_ep_rank_array[..., : logical_to_physical_map.shape[-1]] = logical_to_physical_map[:]
self.model_expert_in_rank_num_list[:] = expert_count[:]
if self.rank == 0:
if self.local_rank == 0:
workload = RedundantExpertWorkload()
workload.tokens_per_expert_stats_list = self.model_tokens_per_expert_stats_list.tolist()
workload.ep_rank_to_expert_id_list = rank_expert_list.tolist()
@@ -287,18 +322,7 @@ class RedundantExpertManager:
update_weight_from_disk
"""
begin_time = time.time()
result_update_weight_from_disk = np.zeros([1], dtype=np.int32)
shm_result_update_weight_from_disk = shared_memory.SharedMemory(
create=False,
size=result_update_weight_from_disk.nbytes,
name=self.get_unique_name("result_update_weight_from_disk"),
)
result_update_weight_from_disk_array = np.ndarray(
result_update_weight_from_disk.shape,
dtype=result_update_weight_from_disk.dtype,
buffer=shm_result_update_weight_from_disk.buf,
)
result_update_weight_from_disk_array[0] = 0
self.update_weight_from_disk_result.value[0] = 0
self.logger.info(f"redundant_expert: update_weight_from_disk send to async process, rank {self.rank}")
self.parent_mg_conn.send(
@@ -312,7 +336,7 @@ class RedundantExpertManager:
self.tensor_infos = response["weights"]
# 更新权重加载结果
result_update_weight_from_disk_array[0] = 1 if response["result"] else -1
self.update_weight_from_disk_result.value[0] = 1 if response["result"] else -1
self.logger.info(
"redundant_expert: update_weight_from_disk end, rank"
+ f" {self.rank} {response['result']}, cost {int(time.time() - begin_time)}s"
@@ -330,8 +354,8 @@ class RedundantExpertManager:
"""
allgather_expert_token_stats
"""
success_count = 0
expert_token_stats = np.zeros((self.num_hidden_layers, self.num_logical_experts), dtype=np.int32)
success_count = 0
for addr in self.dp_rank_address:
try:
# TODO: 请求失败重试
@@ -347,8 +371,10 @@ class RedundantExpertManager:
+ f"addr {addr}, res {res.status_code} {res.json()}"
)
break
for meta_data in res.json()["data"]:
expert_token_stats += np.array(meta_data, dtype=np.int32)
success_count += 1
expert_token_stats += np.array(res.json()["data"], dtype=np.int32)
except Exception as e:
self.logger.error(f"redundant_expert: allgather_expert_token_stats fail. addr {addr}, error {e}")
if success_count == len(self.dp_rank_address):
@@ -426,18 +452,7 @@ class RedundantExpertManager:
or not self.eplb_config.redundant_expert_enable_schedule_cordon
):
self.logger.info("redundant_expert: allreduce_load_weight_result success, notify infer.py")
signal_update_weight_from_tensor = np.zeros([1], dtype=np.int32)
shm_signal_update_weight_from_tensor = shared_memory.SharedMemory(
create=False,
size=signal_update_weight_from_tensor.nbytes,
name=self.get_unique_name("signal_update_weight_from_tensor"),
)
signal_update_weight_from_tensor_array = np.ndarray(
signal_update_weight_from_tensor.shape,
dtype=signal_update_weight_from_tensor.dtype,
buffer=shm_signal_update_weight_from_tensor.buf,
)
signal_update_weight_from_tensor_array[0] = 1
self.signal_update_weight_from_tensor_array.value[0] = 1
return True
def allgather_load_weight_result(self):
@@ -465,140 +480,28 @@ class RedundantExpertManager:
+ f"addr {addr}, res {res.status_code} {res.json()}"
)
break
result = res.json()["data"]
result_list = res.json()["data"]
self.logger.info(
f"redundant_expert: allgather_load_weight_result success. addr {addr}, result {result}"
f"redundant_expert: allgather_load_weight_result success. addr {addr}, result_list {result_list}"
)
if result == 1:
success_count += 1
elif result == -1:
fail_count += 1
self.logger.error(
f"redundant_expert: allgather_load_weight_result fail. addr {addr}, result {result}"
)
exist_fail = True
for result in result_list:
if result == 1:
success_count += 1
elif result == -1:
fail_count += 1
self.logger.error(
f"redundant_expert: allgather_load_weight_result fail. addr {addr}, result {result}"
)
exist_fail = True
except Exception as e:
self.logger.error(f"redundant_expert: allgather_load_weight_result error. addr {addr}, error {e}")
if success_count == len(self.dp_rank_address):
self.logger.info("redundant_expert: allgather_load_weight_result all success")
all_success = True
else:
if fail_count > 0:
self.logger.info(
"redundant_expert: allgather_load_weight_result not all ready, "
+ f"succ {success_count} fail {fail_count} total {len(self.dp_rank_address)}"
)
else:
self.logger.info("redundant_expert: allgather_load_weight_result all success")
all_success = True
return all_success, exist_fail
def init_shared_memory_for_eplb_rank0(rank):
rearrange_experts_ips_size = np.zeros([1], dtype=np.int32)
shm_rearrange_experts_ips_size = shared_memory.SharedMemory(
create=True,
size=rearrange_experts_ips_size.nbytes,
name=f"{envs.get_unique_name('rearrange_experts_ips_size_dprank' + rank)}",
)
rearrange_experts_ips_size_array = np.ndarray(
rearrange_experts_ips_size.shape,
dtype=rearrange_experts_ips_size.dtype,
buffer=shm_rearrange_experts_ips_size.buf,
)
shm_rearrange_experts_ips_list = shared_memory.SharedMemory(
create=True,
size=envs.FD_REDUNDANT_EXPERT_IP_SHM_SIZE,
name=f"{envs.get_unique_name('rearrange_experts_ips_list_dprank' + rank)}",
)
# 记录专家重排状态
rearrange_experts_status = np.zeros([1], dtype=np.int32)
shm_rearrange_experts_status = shared_memory.SharedMemory(
create=True,
size=rearrange_experts_status.nbytes,
name=f"{envs.get_unique_name('rearrange_experts_status_dprank' + rank)}",
)
rearrange_experts_status_array = np.ndarray(
rearrange_experts_status.shape, dtype=rearrange_experts_status.dtype, buffer=shm_rearrange_experts_status.buf
)
# 接收更新权重的信号
signal_update_weight_from_tensor = np.zeros([1], dtype=np.int32)
shm_signal_update_weight_from_tensor = shared_memory.SharedMemory(
create=True,
size=signal_update_weight_from_tensor.nbytes,
name=f"{envs.get_unique_name('signal_update_weight_from_tensor_dprank' + rank) }",
)
signal_update_weight_from_tensor_array = np.ndarray(
signal_update_weight_from_tensor.shape,
dtype=signal_update_weight_from_tensor.dtype,
buffer=shm_signal_update_weight_from_tensor.buf,
)
return (
rearrange_experts_ips_size_array,
shm_rearrange_experts_ips_list,
rearrange_experts_status_array,
signal_update_weight_from_tensor_array,
)
def init_shared_memory_for_eplb_each_rank(fd_config, rank):
# 记录专家负载
num_layers = fd_config.model_config.num_hidden_layers
num_experts = fd_config.model_config.moe_num_experts
experts_token_stats = np.zeros((num_layers, num_experts), dtype=np.int32)
shm_local_experts_token_stats = shared_memory.SharedMemory(
create=True,
size=experts_token_stats.nbytes,
name=f"{envs.get_unique_name('local_experts_token_stats_dprank' + rank)}",
)
local_experts_token_stats_array = np.ndarray(
experts_token_stats.shape, dtype=experts_token_stats.dtype, buffer=shm_local_experts_token_stats.buf
)
# TODO: 全局专家负载状态是一样的节点上的所有DP可以共用一份但需要避免多个DP同时更新
shm_all_experts_token_stats = shared_memory.SharedMemory(
create=True,
size=experts_token_stats.nbytes,
name=f"{envs.get_unique_name('all_experts_token_stats_dprank' + rank)}",
)
expert_tokens_stats_array = np.ndarray(
experts_token_stats.shape, dtype=experts_token_stats.dtype, buffer=shm_all_experts_token_stats.buf
)
# 接收加载权重的信号
signal_update_weight_from_disk = np.zeros([1], dtype=np.int32)
shm_signal_update_weight_from_disk = shared_memory.SharedMemory(
create=True,
size=signal_update_weight_from_disk.nbytes,
name=f"{envs.get_unique_name('signal_update_weight_from_disk_dprank' + rank)}",
)
signal_update_weight_from_disk_array = np.ndarray(
signal_update_weight_from_disk.shape,
dtype=signal_update_weight_from_disk.dtype,
buffer=shm_signal_update_weight_from_disk.buf,
)
# 记录加载权重的结果
result_update_weight_from_disk = np.zeros([1], dtype=np.int32)
shm_result_update_weight_from_disk = shared_memory.SharedMemory(
create=True,
size=result_update_weight_from_disk.nbytes,
name=f"{envs.get_unique_name('result_update_weight_from_disk_dprank' + rank)}",
)
result_update_weight_from_disk_array = np.ndarray(
result_update_weight_from_disk.shape,
dtype=result_update_weight_from_disk.dtype,
buffer=shm_result_update_weight_from_disk.buf,
)
# 接收清零专家负载的信号
signal_clear_experts_token_stats = np.zeros([1], dtype=np.int32)
shm_signal_clear_experts_token_stats = shared_memory.SharedMemory(
create=True,
size=signal_clear_experts_token_stats.nbytes,
name=f"{envs.get_unique_name('signal_clear_experts_token_stats_dprank' + rank)}",
)
signal_clear_experts_token_stats_array = np.ndarray(
signal_clear_experts_token_stats.shape,
dtype=signal_clear_experts_token_stats.dtype,
buffer=shm_signal_clear_experts_token_stats.buf,
)
return (
local_experts_token_stats_array,
expert_tokens_stats_array,
signal_update_weight_from_disk_array,
result_update_weight_from_disk_array,
signal_clear_experts_token_stats_array,
)

View File

@@ -1,9 +1,27 @@
"""eplb utilities"""
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
import json
import os
import time
from enum import Enum
import numpy as np
from fastdeploy.config import FDConfig
from fastdeploy.inter_communicator import IPCSignal
class RedundantExpertWorkload:
@@ -47,13 +65,101 @@ class RedundantExpertWorkload:
return {}, f"redundant_expert: load file {self.meta_file_name} failed, {e}"
class RearrangeExpertState(Enum):
"""RearrangeExpertState"""
def init_eplb_signals(config: FDConfig, ipc_signal_suffix):
"""
Initialize shared memory to indicate eplb status
"""
if config.parallel_config.tensor_parallel_rank != 0:
# only TP rank 0 need to init eplb signals, rank 0 manage all EPLB signals for all TP ranks
return
free = 0
doing = 1
load_succ = 2 # load weight from disk success
done = 3
dp_ipc_signal_suffix = f"{ipc_signal_suffix}_dp{config.parallel_config.local_data_parallel_id}"
# rearrange_experts_status Record the expert's rearrangement status
rearrange_experts_array = np.zeros([1], dtype=np.int32)
_ = IPCSignal(
name="rearrange_experts_status",
array=rearrange_experts_array,
dtype=np.int32,
suffix=dp_ipc_signal_suffix,
create=True,
)
# Record all DP rank IPs when receiving expert rearrangement requests
rearrange_experts_ips_size_array = np.zeros([1], dtype=np.int32)
_ = IPCSignal(
name="rearrange_experts_ips_size",
array=rearrange_experts_ips_size_array,
dtype=np.int32,
suffix=dp_ipc_signal_suffix,
create=True,
)
_ = IPCSignal(
name="rearrange_experts_ips_list",
shm_size=config.eplb_config.redundant_expert_ip_shm_size,
suffix=dp_ipc_signal_suffix,
create=True,
)
# Receive signals for updating weights
signal_update_weight_from_tensor = np.zeros([1], dtype=np.int32)
_ = IPCSignal(
name="signal_update_weight_from_tensor",
array=signal_update_weight_from_tensor,
dtype=np.int32,
suffix=dp_ipc_signal_suffix,
create=True,
)
for rank_id in range(config.parallel_config.tensor_parallel_size):
tp_ipc_signal_suffix = f"{dp_ipc_signal_suffix}_tp{rank_id}"
# Record expert workload
experts_token_stats = np.zeros(
(config.model_config.num_hidden_layers, config.model_config.moe_num_experts),
dtype=np.int32,
)
_ = IPCSignal(
name="all_experts_token_stats",
array=experts_token_stats,
dtype=np.int32,
suffix=tp_ipc_signal_suffix,
create=True,
)
_ = IPCSignal(
name="local_experts_token_stats",
array=experts_token_stats,
dtype=np.int32,
suffix=tp_ipc_signal_suffix,
create=True,
)
# Receive signals for loading weights
signal_update_weight_from_disk = np.zeros([1], dtype=np.int32)
_ = IPCSignal(
name="signal_update_weight_from_disk",
array=signal_update_weight_from_disk,
dtype=np.int32,
suffix=tp_ipc_signal_suffix,
create=True,
)
# Receive signals for clearing expert loads
clear_experts_token_stats = np.zeros([1], dtype=np.int32)
_ = IPCSignal(
name="signal_clear_experts_token_stats",
array=clear_experts_token_stats,
dtype=np.int32,
suffix=tp_ipc_signal_suffix,
create=True,
)
result_update_weight_from_disk = np.zeros([1], dtype=np.int32)
_ = IPCSignal(
name="result_update_weight_from_disk",
array=result_update_weight_from_disk,
dtype=np.int32,
suffix=tp_ipc_signal_suffix,
create=True,
)
if __name__ == "__main__":

View File

@@ -22,6 +22,7 @@ from .ipc_signal_const import (
KVCacheStatus,
ModelWeightsStatus,
PrefixTreeStatus,
RearrangeExpertStatus,
)
from .zmq_client import ZmqIpcClient
from .zmq_server import ZmqIpcServer, ZmqTcpServer
@@ -38,4 +39,5 @@ __all__ = [
"PrefixTreeStatus",
"ModelWeightsStatus",
"KVCacheStatus",
"RearrangeExpertStatus",
]

View File

@@ -55,10 +55,11 @@ class IPCSignal:
def __init__(
self,
name: str,
array: np.ndarray,
dtype: np.dtype,
array: np.ndarray = None,
dtype: np.dtype = None,
suffix: int = None,
create: bool = True,
shm_size: int = None,
) -> None:
"""Initialize or connect to a shared memory block.
@@ -68,29 +69,45 @@ class IPCSignal:
dtype: Data type of the array (must match array.dtype).
suffix: Suffix number that will be appended to the name.
create: If True, creates new memory block; otherwise connects to existing.
shm_size: Size of the shared memory block in bytes.
Raises:
AssertionError: If create=True but memory already exists, or dtype mismatch.
"""
assert isinstance(array, np.ndarray), "Input must be a numpy array"
assert dtype == array.dtype, "Specified dtype must match array dtype"
# Set a suffix for name to avoid name conflict while there are multiple engine launched
if suffix is not None:
name = name + f".{suffix}"
if create:
llm_logger.debug(f"creating ipc signal: {name}")
if shared_memory_exists(name):
llm_logger.warning(f"ShareMemory: {name} already exists, delete it")
SharedMemory(name=name, create=False).unlink()
self.shm = SharedMemory(create=True, size=array.nbytes, name=name)
self.value: np.ndarray = np.ndarray(array.shape, dtype=array.dtype, buffer=self.shm.buf)
self.value[:] = array # Initialize with input array data
if dtype is None or array is None:
assert shm_size is not None, "shm_size must be specified if array and dtype are None"
if create:
llm_logger.debug(f"creating ipc signal: {name}")
if shared_memory_exists(name):
llm_logger.warning(f"ShareMemory: {name} already exists, delete it")
SharedMemory(name=name, create=False).unlink()
self.shm = SharedMemory(create=True, size=shm_size, name=name)
self.value = None
else:
llm_logger.debug(f"attaching ipc signal: {name}")
self.shm = SharedMemory(name=name)
self.value = None
else:
llm_logger.debug(f"attaching ipc signal: {name}")
self.shm = SharedMemory(name=name)
self.value: np.ndarray = np.ndarray(array.shape, dtype=array.dtype, buffer=self.shm.buf)
assert isinstance(array, np.ndarray), "Input must be a numpy array"
assert dtype == array.dtype, "Specified dtype must match array dtype"
if create:
llm_logger.debug(f"creating ipc signal: {name}")
if shared_memory_exists(name):
llm_logger.warning(f"ShareMemory: {name} already exists, delete it")
SharedMemory(name=name, create=False).unlink()
self.shm = SharedMemory(create=True, size=array.nbytes, name=name)
self.value: np.ndarray = np.ndarray(array.shape, dtype=array.dtype, buffer=self.shm.buf)
self.value[:] = array # Initialize with input array data
else:
llm_logger.debug(f"attaching ipc signal: {name}")
self.shm = SharedMemory(name=name)
self.value: np.ndarray = np.ndarray(array.shape, dtype=array.dtype, buffer=self.shm.buf)
def clear(self) -> None:
"""Release system resources and unlink the shared memory block."""

View File

@@ -1,4 +1,21 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
from dataclasses import dataclass
from enum import Enum
@dataclass
@@ -30,3 +47,10 @@ class ExistTaskStatus:
EMPTY = 0
EXIST = 1
REFUSE = 2
class RearrangeExpertStatus(Enum):
FREE = 0
DOING = 1
LOAD_SUCC = 2 # load weight from disk success
DONE = 3

View File

@@ -368,7 +368,7 @@ class Ernie4_5_Model(nn.Layer):
fd_config.model_config.pretrained_config.prefix_name = "ernie"
self.fd_config = fd_config
self.redundant_table_manger = None
if fd_config.model_config.enable_redundant_experts is True:
if fd_config.eplb_config.enable_eplb is True:
self.redundant_table_manger = RedundantExpertManger(
n_routed_experts=fd_config.model_config.moe_num_experts,
num_hidden_layers=fd_config.model_config.num_hidden_layers,

View File

@@ -64,6 +64,7 @@ class RolloutModelConfig:
plas_attention_config: str = None,
data_parallel_size: int = 1,
num_nextn_predict_layers: int = 0,
eplb_config: str = {},
):
# Required parameters
self.model = model_name_or_path
@@ -111,6 +112,7 @@ class RolloutModelConfig:
self.ips = None
self.plas_attention_config = plas_attention_config
self.num_nextn_predict_layers = num_nextn_predict_layers
self.eplb_config = eplb_config
def __str__(self):
return "\n".join(f"{k}: {v}" for k, v in self.__dict__.items())

View File

@@ -18,7 +18,6 @@ import argparse
import json
import os
import time
from multiprocessing import shared_memory
from typing import Tuple
import numpy as np
@@ -49,9 +48,13 @@ from fastdeploy.eplb.async_expert_loader import (
load_tensor_from_shm_mem,
)
from fastdeploy.eplb.experts_manager import RedundantExpertManager
from fastdeploy.eplb.utils import RearrangeExpertState
from fastdeploy.inter_communicator import EngineWorkerQueue as TaskQueue
from fastdeploy.inter_communicator import ExistTaskStatus, IPCSignal, ModelWeightsStatus
from fastdeploy.inter_communicator import (
ExistTaskStatus,
IPCSignal,
ModelWeightsStatus,
RearrangeExpertStatus,
)
from fastdeploy.model_executor.layers.quantization import parse_quant_config
from fastdeploy.model_executor.utils import v1_loader_support
from fastdeploy.platforms import current_platform
@@ -287,68 +290,122 @@ class PaddleDisWorkerProc:
else:
paddle.distributed.barrier(self.parallel_config.tp_group)
def _init_eplb_signal(self):
if not self.eplb_config.enable_eplb:
return
local_rank = self.local_rank % self.parallel_config.tensor_parallel_size
self.last_dump_expert_workload_ts = 0
self.experts_manager = RedundantExpertManager(
rank=self.local_rank,
ep_size=self.ranks,
fd_config=self.fd_config,
ipc_signal_suffix=self.parallel_config.engine_worker_queue_port,
)
dp_ipc_signal_suffix = (
f"{self.parallel_config.engine_worker_queue_port}_dp{self.parallel_config.local_data_parallel_id}"
)
if local_rank == 0: # master rank0
signal_update_weight_from_tensor = np.zeros([1], dtype=np.int32)
self.signal_update_weight_from_tensor_array = IPCSignal(
name="signal_update_weight_from_tensor",
array=signal_update_weight_from_tensor,
dtype=np.int32,
suffix=dp_ipc_signal_suffix,
create=False,
)
rearrange_experts_status = np.zeros([1], dtype=np.int32)
self.rearrange_experts_signal = IPCSignal(
name="rearrange_experts_status",
array=rearrange_experts_status,
dtype=np.int32,
suffix=dp_ipc_signal_suffix,
create=False,
)
tp_ipc_signal_suffix = f"{dp_ipc_signal_suffix}_tp{local_rank}"
experts_token_stats = np.zeros(
(self.fd_config.model_config.num_hidden_layers, self.fd_config.model_config.moe_num_experts),
dtype=np.int32,
)
self.local_experts_token_stats_array = IPCSignal(
name="local_experts_token_stats",
array=experts_token_stats,
dtype=np.int32,
suffix=tp_ipc_signal_suffix,
create=False,
)
clear_experts_token_stats = np.zeros([1], dtype=np.int32)
self.signal_clear_experts_token_stats = IPCSignal(
name="signal_clear_experts_token_stats",
array=clear_experts_token_stats,
dtype=np.int32,
suffix=tp_ipc_signal_suffix,
create=False,
)
self.mmap_infos = create_mmap(
[MODEL_MAIN_NAME],
self.local_rank,
self.ranks,
shm_uuid=self.parallel_config.engine_worker_queue_port,
eplb_config=self.eplb_config,
logger=logger,
)
def _run_eplb(self, tp_rank):
"""internal call to run eplb"""
if not self.eplb_config.enable_eplb:
return
rearrange_time = time.time()
# Get expert load
if self.local_experts_token_stats_array.value is not None and (
int(rearrange_time) - self.last_dump_expert_workload_ts
> self.eplb_config.redundant_expert_dump_workload_interval
):
self.last_dump_expert_workload_ts = int(rearrange_time)
clear_stat = False
if self.signal_clear_experts_token_stats.value[0] == 1:
clear_stat = True
self.signal_clear_experts_token_stats.value[0] = 0
(
new_stats_array,
_,
_,
_,
) = self.worker.get_model().redundant_table_manger.get_expert_tokens_stats(clear_stat=clear_stat)
self.local_experts_token_stats_array.value[:] = new_stats_array[:]
elif self.local_experts_token_stats_array.value is None:
logger.warning("redundant_expert: local_experts_token_stats not init")
# All DP synchronously update weights
broadcast_value = 0
if tp_rank == 0 and self.signal_update_weight_from_tensor_array.value[0] == 1:
logger.info("redundant_expert: update_weight_from_tensor broadcast signal")
self.signal_update_weight_from_tensor_array.value[0] = 0
broadcast_value = REARRANGE_EXPERT_MAGIC_NUM
data = paddle.to_tensor([broadcast_value])
paddle.distributed.broadcast(data, 0)
if data[0] == REARRANGE_EXPERT_MAGIC_NUM:
self.update_weights_from_tensor(self.mmap_infos)
logger.info(
f"redundant_expert: update_weight_from_tensor success, cost {(time.time() - rearrange_time)*1000}ms"
)
paddle.distributed.barrier()
if tp_rank == 0:
self.rearrange_experts_signal.value[0] = RearrangeExpertStatus.DONE.value
logger.info("redundant_expert: done")
def event_loop_normal(self) -> None:
"""Main event loop for Paddle Distributed Workers.
TODO(gongshaotian): support remote calling of functions that control worker.
"""
if self.eplb_config.enable_redundant_experts:
self.last_dump_expert_workload_ts = 0
self.experts_manager = RedundantExpertManager(
rank=self.local_rank, ep_size=self.ranks, fd_config=self.fd_config
)
num_layers = self.fd_config.model_config.num_hidden_layers
num_experts = self.fd_config.model_config.moe_num_experts
expert_token_stats = np.zeros((num_layers, num_experts), dtype=np.int32)
shm_local_experts_token_stats = shared_memory.SharedMemory(
create=False,
size=expert_token_stats.nbytes,
name=f"{envs.get_unique_name('local_experts_token_stats_dprank' + self.local_rank)}",
)
expert_tokens_stats_array = np.ndarray(
expert_token_stats.shape, dtype=expert_token_stats.dtype, buffer=shm_local_experts_token_stats.buf
)
signal_clear_experts_token_stats = np.zeros([1], dtype=np.int32)
shm_signal_clear_experts_token_stats = shared_memory.SharedMemory(
create=False,
size=signal_clear_experts_token_stats.nbytes,
name=f"{envs.get_unique_name('signal_clear_experts_token_stats_dprank' + self.local_rank)}",
)
signal_clear_experts_token_stats_array = np.ndarray(
signal_clear_experts_token_stats.shape,
dtype=signal_clear_experts_token_stats.dtype,
buffer=shm_signal_clear_experts_token_stats.buf,
)
if self.local_rank == 0:
signal_update_weight_from_tensor = np.zeros([1], dtype=np.int32)
shm_signal_update_weight_from_tensor = shared_memory.SharedMemory(
create=False,
size=signal_update_weight_from_tensor.nbytes,
name=f"{envs.get_unique_name('signal_update_weight_from_tensor_dprank' + self.local_rank)}",
)
signal_update_weight_from_tensor_array = np.ndarray(
signal_update_weight_from_tensor.shape,
dtype=signal_update_weight_from_tensor.dtype,
buffer=shm_signal_update_weight_from_tensor.buf,
)
rearrange_experts_status = np.zeros([1], dtype=np.int32)
shm_rearrange_experts_status = shared_memory.SharedMemory(
create=False,
size=rearrange_experts_status.nbytes,
name=f"{envs.get_unique_name('rearrange_experts_status_dprank' + self.local_rank)}",
)
rearrange_experts_status_array = np.ndarray(
rearrange_experts_status.shape,
dtype=rearrange_experts_status.dtype,
buffer=shm_rearrange_experts_status.buf,
)
expert_workload_dump_interval = envs.FD_REDUNDANT_EXPERT_DUMP_WORKLOAD_INTERVAL
mmap_infos = create_mmap(
[MODEL_MAIN_NAME], self.local_rank, self.ranks, shm_uuid=os.getenv("SHM_UUID", ""), logger=logger
)
# init eplb signal
self._init_eplb_signal()
tp_size = self.parallel_config.tensor_parallel_size
# Currently, only support single node
self.nnode = int((tp_size + 7) // 8)
@@ -358,44 +415,8 @@ class PaddleDisWorkerProc:
self.model_weights_signal = np.zeros([1], dtype=np.int32)
while True:
if self.eplb_config.enable_redundant_experts:
rearrange_time = time.time()
# 获取专家负载
if expert_tokens_stats_array is not None and (
int(rearrange_time) - self.last_dump_expert_workload_ts > expert_workload_dump_interval
):
self.last_dump_expert_workload_ts = int(rearrange_time)
clear_stat = False
if signal_clear_experts_token_stats_array[0] == 1:
clear_stat = True
signal_clear_experts_token_stats_array[0] = 0
(
new_stats_array,
_,
_,
_,
) = self.worker.get_model().redundant_table_manger.get_expert_tokens_stats(clear_stat=clear_stat)
expert_tokens_stats_array[:] = new_stats_array[:]
elif expert_tokens_stats_array is None:
logger.warning("redundant_expert: expert_tokens_stats_array not init")
# 所有DP同步更新权重
broadcast_value = 0
if self.local_rank == 0 and signal_update_weight_from_tensor_array[0] == 1:
logger.info("redundant_expert: update_weight_from_tensor broadcast signal")
signal_update_weight_from_tensor_array[0] = 0
broadcast_value = REARRANGE_EXPERT_MAGIC_NUM
data = paddle.to_tensor([broadcast_value])
paddle.distributed.broadcast(data, 0)
if data[0] == REARRANGE_EXPERT_MAGIC_NUM:
self.update_weights_from_tensor(mmap_infos)
logger.info(
f"redundant_expert: update_weight_from_tensor success, cost {(time.time() - rearrange_time)*1000}ms"
)
paddle.distributed.barrier()
if self.local_rank == 0:
rearrange_experts_status_array[0] = RearrangeExpertState.done.value
logger.info("redundant_expert: done")
# run eplb
self._run_eplb(tp_rank)
if tp_rank == 0:
if self.model_weights_status.value[0] != ModelWeightsStatus.NORMAL:
self.model_weights_signal[0] = int(self.model_weights_status.value[0])
@@ -842,6 +863,13 @@ def parse_args():
help="FQCNs (Fully Qualified Class Names) of logits processors supported by the service.",
)
parser.add_argument(
"--eplb_config",
type=json.loads,
default=None,
help="EPLB Configuration.",
)
args = parser.parse_args()
return args
@@ -897,7 +925,7 @@ def initialize_fd_config(args, ranks: int = 1, local_rank: int = 0) -> FDConfig:
plas_attention_config = PlasAttentionConfig(args.plas_attention_config)
early_stop_config = EarlyStopConfig(args.early_stop_config)
eplb_config = EPLBConfig()
eplb_config = EPLBConfig(args.eplb_config)
structured_outputs_config: StructuredOutputsConfig = StructuredOutputsConfig(args=vars(args))

View File

@@ -46,3 +46,4 @@ msgspec
einops
setproctitle
aistudio_sdk
cuda-python==12.8

View File

@@ -0,0 +1,211 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
import os
import tempfile
import unittest
from unittest.mock import MagicMock, patch
import numpy as np
from fastdeploy.config import EPLBConfig
from fastdeploy.eplb.async_expert_loader import (
AsyncEPLoader,
create_mmap,
load_ep_checkpoint,
load_model_weights_process,
)
class TestAsyncExpertLoader(unittest.TestCase):
    """Test cases for async_expert_loader.py"""

    def setUp(self):
        """Set up test fixtures: a scratch directory and a minimal EPLBConfig."""
        self.temp_dir = tempfile.mkdtemp()
        # Only the knobs the loader reads are supplied; everything else
        # falls back to EPLBConfig defaults.
        args = {
            "redundant_expert_async_load_model_shmem_size_gb": 1,
            "model_use_safetensors": False,
            "moe_quant_type": "",
        }
        self.eplb_config = EPLBConfig(args)

    def tearDown(self):
        """Clean up test fixtures"""
        import shutil

        shutil.rmtree(self.temp_dir)

    def test_create_mmap(self):
        """Test create_mmap function"""
        # Mock cuda import and functions so no real CUDA host registration
        # or mmap syscall happens.
        with patch("fastdeploy.eplb.async_expert_loader.cudart", create=True) as mock_cudart:
            # Create proper mock for cudaError_t
            class MockCudaErrorT:
                cudaSuccess = 0
                cudaErrorInvalidValue = 1

            mock_cudart.cudaError_t = MockCudaErrorT
            # Setup mock to return proper cudaError_t instance
            mock_cudart.cudaHostRegister.return_value = (mock_cudart.cudaError_t.cudaSuccess,)
            mock_cudart.cudaGetErrorString.return_value = (mock_cudart.cudaError_t.cudaSuccess, b"Success")
            model_name = ["test_model"]
            ep_rank = 0
            ep_size = 1
            shm_uuid = "test_uuid"
            # Mock logger
            mock_logger = MagicMock()
            # Stub out filesystem and libc interactions used by the mmap path.
            with (
                patch("os.path.isfile", return_value=False),
                patch("os.open"),
                patch("os.ftruncate"),
                patch("ctypes.CDLL") as mock_libc,
                patch("ctypes.addressof") as mock_addressof,
                patch("ctypes.cast") as mock_cast,
            ):
                mock_libc.return_value.mmap.return_value = 12345  # Mock mmap pointer
                mock_addressof.return_value = 12345  # Mock address
                mock_cast.contents = 12345  # Mock cast
                result = create_mmap(model_name, ep_rank, ep_size, shm_uuid, self.eplb_config, mock_logger)
                # The returned mapping must be keyed by model name.
                self.assertIn("test_model", result)

    def test_load_ep_checkpoint(self):
        """Test load_ep_checkpoint function"""
        # Create test index file mimicking a safetensors weight index.
        index_file = os.path.join(self.temp_dir, "model.safetensors.index.json")
        index_data = {"weight_map": {"weight1": "file1.safetensors", "weight2": "file2.safetensors"}}
        import json

        with open(index_file, "w") as f:
            json.dump(index_data, f)
        # Test loading checkpoint: every key of weight_map should be present.
        result = load_ep_checkpoint(self.temp_dir)
        self.assertEqual(len(result), 2)
        self.assertIn("weight1", result)
        self.assertIn("weight2", result)

    def test_async_ep_loader_init(self):
        """Test AsyncEPLoader initialization"""
        model_dir = "/test/model"
        rank = 0
        expert_per_rank = 8
        moe_layer_start_index = 3
        moe_quant_type = ""
        mock_logger = MagicMock()
        loader = AsyncEPLoader(
            model_dir=model_dir,
            eplb_config=self.eplb_config,
            rank=rank,
            expert_per_rank=expert_per_rank,
            moe_layer_start_index=moe_layer_start_index,
            moe_quant_type=moe_quant_type,
            logger=mock_logger,
        )
        # Constructor arguments must be stored verbatim on the instance.
        self.assertEqual(loader.model_path, model_dir)
        self.assertEqual(loader.ep_rank, rank)
        self.assertEqual(loader.expert_per_rank, expert_per_rank)
        self.assertEqual(loader.moe_layer_start_index, moe_layer_start_index)

    def test_async_ep_loader_reset(self):
        """Test AsyncEPLoader reset method"""
        mock_logger = MagicMock()
        loader = AsyncEPLoader(model_dir="/test/model", eplb_config=self.eplb_config, logger=mock_logger)
        # Set some state
        loader.old_model_ep_rank_to_expert_id_list = np.array([[1, 2]])
        loader.cached_weights = [("test", "weight")]
        # Reset: all transient state should be dropped.
        loader.reset()
        self.assertIsNone(loader.old_model_ep_rank_to_expert_id_list)
        self.assertIsNone(loader.new_model_ep_rank_to_expert_id_list)
        self.assertEqual(len(loader.cached_weights), 0)

    @patch("fastdeploy.eplb.async_expert_loader.paddle.load")
    @patch("os.path.exists")
    def test_load_weight_bf16_from_disk(self, mock_exists, mock_paddle_load):
        """Test load_weight_bf16_from_disk method"""
        mock_exists.return_value = True
        mock_paddle_load.return_value = "test_weight"
        mock_logger = MagicMock()
        loader = AsyncEPLoader(model_dir=self.temp_dir, eplb_config=self.eplb_config, logger=mock_logger)
        need_to_reload = [(3, 0)]  # layer_id, expert_id
        # Mock paddle.device.get_device and set_device so the load runs on CPU.
        with patch("paddle.device.get_device", return_value="cpu"), patch("paddle.set_device"):
            success, message = loader.load_weight_bf16_from_disk(need_to_reload)
        self.assertTrue(success)
        self.assertIn("Succeeded", message)

    def test_load_model_weights_process_integration(self):
        """Test load_model_weights_process function"""
        # This is a complex integration test that would require mocking many components
        # For now, we'll test that the function can be called without errors
        try:
            # Mock all the dependencies
            with (
                patch("fastdeploy.eplb.async_expert_loader.setproctitle"),
                patch("fastdeploy.eplb.async_expert_loader.faulthandler"),
                patch("fastdeploy.eplb.async_expert_loader.paddle.set_device"),
                patch("fastdeploy.eplb.async_expert_loader.AsyncEPLoader") as mock_loader_class,
            ):
                mock_loader = MagicMock()
                mock_loader_class.return_value = mock_loader
                mock_loader.load_experts_weight_from_disk.return_value = (True, "success")
                mock_loader.cached_weights = []
                # Mock connections
                mock_mg_conn = MagicMock()
                mock_data_conn = MagicMock()
                # Mock the function call
                load_model_weights_process(
                    rank=0,
                    model_dir=self.temp_dir,
                    expert_per_rank=8,
                    moe_layer_start_index=3,
                    moe_quant_type="",
                    shm_uuid="test",
                    eplb_config=self.eplb_config,
                    data_conn=mock_data_conn,
                    mg_conn=mock_mg_conn,
                )
                # Verify that the loader was created
                mock_loader_class.assert_called_once()
        except Exception:
            # NOTE(review): this swallows all failures, so the test can never
            # fail once it reaches the try-block — consider tightening.
            # The function might fail due to missing dependencies, but we want to test the structure
            self.assertTrue(True)  # Basic structure test passed
if __name__ == "__main__":
unittest.main()

246
tests/eplb/test_eplb.py Normal file
View File

@@ -0,0 +1,246 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
import unittest
import numpy as np
from fastdeploy.eplb.eplb import (
balanced_packing,
rebalance_experts,
rebalance_experts_hierarchical,
rebalance_experts_intra_node,
replicate_experts,
)
class TestEplb(unittest.TestCase):
    """Test cases for eplb.py"""

    def test_balanced_packing_simple(self):
        """Test balanced_packing with simple case"""
        # Test case with 4 items and 2 packs
        weight = np.array([[1, 2, 3, 4]], dtype=np.float32)
        num_packs = 2
        pack_index, rank_in_pack = balanced_packing(weight, num_packs)
        # Expected values pin the greedy assignment order of the packer —
        # presumably heaviest-first into the lightest pack; TODO confirm
        # against the eplb.balanced_packing implementation.
        expected_pack_index = np.array([[0, 1, 1, 0]], dtype=np.int32)
        expected_rank_in_pack = np.array([[1, 1, 0, 0]], dtype=np.int32)
        np.testing.assert_array_equal(pack_index, expected_pack_index)
        np.testing.assert_array_equal(rank_in_pack, expected_rank_in_pack)

    def test_balanced_packing_single_pack(self):
        """Test balanced_packing with single pack"""
        weight = np.array([[1, 2, 3, 4]], dtype=np.float32)
        num_packs = 4  # Each pack gets exactly one item
        pack_index, rank_in_pack = balanced_packing(weight, num_packs)
        expected_pack_index = np.array([[0, 1, 2, 3]], dtype=np.int32)
        expected_rank_in_pack = np.array([[0, 0, 0, 0]], dtype=np.int32)
        np.testing.assert_array_equal(pack_index, expected_pack_index)
        np.testing.assert_array_equal(rank_in_pack, expected_rank_in_pack)

    def test_balanced_packing_multiple_layers(self):
        """Test balanced_packing with multiple layers"""
        weight = np.array([[1, 2, 3, 4], [4, 3, 2, 1]], dtype=np.float32)
        num_packs = 2
        pack_index, rank_in_pack = balanced_packing(weight, num_packs)
        # Verify shape
        self.assertEqual(pack_index.shape, (2, 4))
        self.assertEqual(rank_in_pack.shape, (2, 4))
        # Verify that each pack gets exactly 2 items per layer
        for layer_idx in range(2):
            unique_packs, counts = np.unique(pack_index[layer_idx], return_counts=True)
            np.testing.assert_array_equal(counts, [2, 2])

    def test_replicate_experts_no_redundancy(self):
        """Test replicate_experts with no redundant experts"""
        weight = np.array([[1, 2, 3, 4]], dtype=np.float32)
        num_phy = 4  # Same as number of logical experts
        phy2log, rank, logcnt = replicate_experts(weight, num_phy)
        # With no redundancy the mapping must be the identity.
        expected_phy2log = np.array([[0, 1, 2, 3]], dtype=np.int32)
        expected_rank = np.array([[0, 0, 0, 0]], dtype=np.int32)
        expected_logcnt = np.array([[1, 1, 1, 1]], dtype=np.int32)
        np.testing.assert_array_equal(phy2log, expected_phy2log)
        np.testing.assert_array_equal(rank, expected_rank)
        np.testing.assert_array_equal(logcnt, expected_logcnt)

    def test_replicate_experts_with_redundancy(self):
        """Test replicate_experts with redundant experts"""
        weight = np.array([[1, 2, 3, 4]], dtype=np.float32)
        num_phy = 6  # 2 redundant experts
        phy2log, rank, logcnt = replicate_experts(weight, num_phy)
        # Verify shape
        self.assertEqual(phy2log.shape, (1, 6))
        self.assertEqual(rank.shape, (1, 6))
        self.assertEqual(logcnt.shape, (1, 4))
        # Verify that each logical expert has correct count
        expected_logcnt = np.array([[1, 1, 2, 2]], dtype=np.int32)  # Heaviest and lightest get replicated
        np.testing.assert_array_equal(logcnt, expected_logcnt)

    def test_rebalance_experts_intra_node(self):
        """Test rebalance_experts_intra_node function"""
        weight = np.array([[1, 2, 3, 4]], dtype=np.float32)
        num_physical_experts = 4
        num_groups = 1
        num_nodes = 1
        num_gpus = 1
        phy2log, phyrank, logcnt = rebalance_experts_intra_node(
            weight, num_physical_experts, num_groups, num_nodes, num_gpus
        )
        # Verify shape
        self.assertEqual(phy2log.shape, (1, 4))
        self.assertEqual(phyrank.shape, (1, 4))
        self.assertEqual(logcnt.shape, (1, 4))

    def test_rebalance_experts_hierarchical(self):
        """Test rebalance_experts_hierarchical function"""
        weight = np.array([[1, 2, 3, 4]], dtype=np.float32)
        num_physical_experts = 4
        num_groups = 2
        num_nodes = 1
        num_gpus = 1
        phy2log, phyrank, logcnt = rebalance_experts_hierarchical(
            weight, num_physical_experts, num_groups, num_nodes, num_gpus
        )
        # Verify shape
        self.assertEqual(phy2log.shape, (1, 4))
        self.assertEqual(phyrank.shape, (1, 4))
        self.assertEqual(logcnt.shape, (1, 4))

    def test_rebalance_experts_balance_intra_node(self):
        """Test rebalance_experts with balance_intra_node strategy"""
        weight = np.array([[1, 2, 3, 4]], dtype=np.float32)
        num_replicas = 4
        num_groups = 1
        num_nodes = 1
        num_gpus = 1
        phy2log, log2phy, logcnt = rebalance_experts(
            weight, num_replicas, num_groups, num_nodes, num_gpus, "balance_intra_node"
        )
        # Verify shape
        self.assertEqual(phy2log.shape, (1, 4))
        self.assertEqual(log2phy.shape, (1, 4, 1))  # maxlogcnt = 1 when no redundancy
        self.assertEqual(logcnt.shape, (1, 4))

    def test_rebalance_experts_hierarchical_strategy(self):
        """Test rebalance_experts with hierarchical strategy"""
        weight = np.array([[1, 2, 3, 4, 5, 6, 7, 8]], dtype=np.float32)
        num_replicas = 8
        num_groups = 4  # Divisible by num_nodes
        num_nodes = 2
        num_gpus = 4
        phy2log, log2phy, logcnt = rebalance_experts(weight, num_replicas, num_groups, num_nodes, num_gpus)
        # Verify shape
        self.assertEqual(phy2log.shape, (1, 8))
        self.assertEqual(log2phy.shape, (1, 8, 1))  # maxlogcnt = 1 when no redundancy
        self.assertEqual(logcnt.shape, (1, 8))

    def test_rebalance_experts_global_strategy(self):
        """Test rebalance_experts with global strategy (groups not divisible by nodes)"""
        weight = np.array([[1, 2, 3, 4]], dtype=np.float32)
        num_replicas = 4
        num_groups = 3  # Not divisible by num_nodes
        num_nodes = 2
        num_gpus = 2
        phy2log, log2phy, logcnt = rebalance_experts(weight, num_replicas, num_groups, num_nodes, num_gpus)
        # Verify shape
        self.assertEqual(phy2log.shape, (1, 4))
        self.assertEqual(log2phy.shape, (1, 4, 1))
        self.assertEqual(logcnt.shape, (1, 4))

    def test_rebalance_experts_with_redundancy(self):
        """Test rebalance_experts with redundant experts"""
        weight = np.array([[1, 2, 3, 4]], dtype=np.float32)
        num_replicas = 6  # 2 redundant experts
        num_groups = 1
        num_nodes = 1
        num_gpus = 1
        phy2log, log2phy, logcnt = rebalance_experts(weight, num_replicas, num_groups, num_nodes, num_gpus)
        # Verify shape
        self.assertEqual(phy2log.shape, (1, 6))
        self.assertEqual(log2phy.shape, (1, 4, 2))  # maxlogcnt = 2 with redundancy
        self.assertEqual(logcnt.shape, (1, 4))
        # Verify that logical expert counts sum to num_replicas
        self.assertEqual(logcnt.sum(), num_replicas)

    def test_edge_cases(self):
        """Test edge cases for rebalance_experts"""
        # Test with all zero weights
        weight = np.zeros((2, 4), dtype=np.float32)
        num_replicas = 4
        num_groups = 1
        num_nodes = 1
        num_gpus = 1
        phy2log, log2phy, logcnt = rebalance_experts(weight, num_replicas, num_groups, num_nodes, num_gpus)
        # Should still produce valid results
        self.assertEqual(phy2log.shape, (2, 4))
        self.assertEqual(log2phy.shape, (2, 4, 1))
        self.assertEqual(logcnt.shape, (2, 4))

    def test_large_scale(self):
        """Test with larger scale parameters"""
        # NOTE(review): np.random is unseeded here; shapes are deterministic
        # but the weights differ per run — seed if flakiness appears.
        num_layers = 10
        num_experts = 64
        weight = np.random.randint(1, 100, size=(num_layers, num_experts)).astype(np.float32)
        num_replicas = 64
        num_groups = 8
        num_nodes = 4
        num_gpus = 32
        phy2log, log2phy, logcnt = rebalance_experts(weight, num_replicas, num_groups, num_nodes, num_gpus)
        # Verify shape
        self.assertEqual(phy2log.shape, (num_layers, num_replicas))
        self.assertEqual(log2phy.shape[0], num_layers)
        self.assertEqual(log2phy.shape[1], num_experts)
        self.assertEqual(logcnt.shape, (num_layers, num_experts))
        # Verify that logical expert counts sum to num_replicas for each layer
        for layer_idx in range(num_layers):
            self.assertEqual(logcnt[layer_idx].sum(), num_replicas)
if __name__ == "__main__":
unittest.main()

View File

@@ -0,0 +1,364 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
import json
import os
import tempfile
import unittest
from dataclasses import asdict
from types import SimpleNamespace
from unittest.mock import MagicMock, patch
import numpy as np
from fastdeploy.config import (
CacheConfig,
EPLBConfig,
FDConfig,
ParallelConfig,
SchedulerConfig,
)
from fastdeploy.engine.args_utils import EngineArgs
from fastdeploy.eplb.utils import RedundantExpertWorkload, init_eplb_signals
class TestRedundantExpertWorkload(unittest.TestCase):
    """Unit tests for RedundantExpertWorkload persistence (dump/load/json)."""

    def setUp(self):
        """Create a scratch directory for the workload meta file."""
        self.temp_dir = tempfile.mkdtemp()

    def tearDown(self):
        """Delete the scratch directory."""
        import shutil

        shutil.rmtree(self.temp_dir)

    def test_init(self):
        """A fresh workload has no data and derives its meta file path."""
        workload = RedundantExpertWorkload(self.temp_dir)
        for field in (
            "tokens_per_expert_stats_list",
            "ep_rank_to_expert_id_list",
            "expert_id_to_ep_rank_array",
            "expert_in_rank_num_list",
        ):
            self.assertIsNone(getattr(workload, field))
        self.assertEqual(workload.cost_milliseconds, 0)
        self.assertEqual(workload.meta_file_name, f"{self.temp_dir}/rearrange-experts.json")
        # Constructing the workload must ensure the target directory exists.
        self.assertTrue(os.path.exists(self.temp_dir))

    def test_json_method(self):
        """__json__ reflects every populated attribute."""
        workload = RedundantExpertWorkload(self.temp_dir)
        expected = {
            "tokens_per_expert_stats_list": [[1, 2], [3, 4]],
            "ep_rank_to_expert_id_list": [[0, 1]],
            "expert_id_to_ep_rank_array": [[[0], [1]]],
            "expert_in_rank_num_list": [[1, 1]],
            "cost_milliseconds": 100,
        }
        for name, value in expected.items():
            setattr(workload, name, value)
        serialized = workload.__json__()
        for name, value in expected.items():
            self.assertEqual(serialized[name], value)

    def test_dump_success(self):
        """dump() persists the workload as JSON and reports the location."""
        workload = RedundantExpertWorkload(self.temp_dir)
        expected = {
            "tokens_per_expert_stats_list": [[1, 2]],
            "ep_rank_to_expert_id_list": [[0, 1]],
            "expert_id_to_ep_rank_array": [[[0], [1]]],
            "expert_in_rank_num_list": [[1, 1]],
            "cost_milliseconds": 100,
        }
        for name, value in expected.items():
            setattr(workload, name, value)
        result = workload.dump()
        # The meta file must now exist and round-trip through json.
        self.assertTrue(os.path.exists(workload.meta_file_name))
        with open(workload.meta_file_name, "r") as fh:
            on_disk = json.load(fh)
        for name, value in expected.items():
            self.assertEqual(on_disk[name], value)
        self.assertIn("redundant_expert: dump expert workload result in", result)

    def test_load_success(self):
        """load() round-trips a previously written meta file."""
        expected = {
            "tokens_per_expert_stats_list": [[1, 2], [3, 4]],
            "ep_rank_to_expert_id_list": [[0, 1]],
            "expert_id_to_ep_rank_array": [[[0], [1]]],
            "expert_in_rank_num_list": [[1, 1]],
            "cost_milliseconds": 100,
        }
        with open(os.path.join(self.temp_dir, "rearrange-experts.json"), "w") as fh:
            json.dump(expected, fh)
        data, message = RedundantExpertWorkload(self.temp_dir).load()
        for name, value in expected.items():
            self.assertEqual(data[name], value)
        self.assertEqual(message, "ok")

    def test_load_file_not_exists(self):
        """load() reports a missing meta file without raising."""
        data, message = RedundantExpertWorkload(self.temp_dir).load()
        self.assertEqual(data, {})
        self.assertIn("is not exists", message)

    def test_load_corrupted_file(self):
        """load() reports a parse failure for malformed JSON."""
        with open(os.path.join(self.temp_dir, "rearrange-experts.json"), "w") as fh:
            fh.write("invalid json content")
        data, message = RedundantExpertWorkload(self.temp_dir).load()
        self.assertEqual(data, {})
        self.assertIn("load file", message)
        self.assertIn("failed", message)
class TestInitEplbSignals(unittest.TestCase):
    """Test cases for init_eplb_signals function"""

    def setUp(self):
        """Set up test fixtures: build a minimal FDConfig with EPLB enabled knobs."""
        max_num_seqs = 2
        engine_args = EngineArgs(
            max_num_seqs=max_num_seqs,
            num_gpu_blocks_override=102,
            max_num_batched_tokens=3200,
        )
        args = asdict(engine_args)
        cache_cfg = CacheConfig(args)
        model_cfg = SimpleNamespace(enable_mm=True)  # Enable multimodal for feature testing
        speculative_cfg = SimpleNamespace(method=None)
        model_cfg.print = print
        model_cfg.max_model_len = 5120
        # 3 hidden layers x 64 experts drive the expected stats-array shapes
        # asserted in the tests below.
        model_cfg.num_hidden_layers = 3
        model_cfg.moe_num_experts = 64
        model_cfg.moe_layer_start_index = 1
        model_cfg.model = "/test/model"
        cache_cfg.bytes_per_layer_per_block = 1
        parallel_cfg = ParallelConfig(args)
        scheduler_cfg = SchedulerConfig(args)
        graph_opt_cfg = engine_args.create_graph_optimization_config()
        eplb_args = {
            "redundant_experts_num": 0,
            "redundant_expert_api_user": "test_user",
            "redundant_expert_api_password": "test_pass",
            "redundant_expert_eplb_strategy": "",
            "redundant_expert_ip_shm_size": 1024,
            "moe_quant_type": "",
            "redundant_expert_enable_schedule_cordon": False,
        }
        eplb_config = EPLBConfig(eplb_args)
        self.fd_config = FDConfig(
            model_config=model_cfg,
            cache_config=cache_cfg,
            parallel_config=parallel_cfg,
            graph_opt_config=graph_opt_cfg,
            speculative_config=speculative_cfg,
            scheduler_config=scheduler_cfg,
            eplb_config=eplb_config,
        )
        self.fd_config.parallel_config.local_data_parallel_id = 0

    @patch("fastdeploy.eplb.utils.IPCSignal")
    def test_init_eplb_signals_rank_0(self, mock_ipc_signal):
        """Test init_eplb_signals for rank 0"""
        mock_ipc_instance = MagicMock()
        mock_ipc_signal.return_value = mock_ipc_instance
        # Test with rank 0
        self.fd_config.parallel_config.local_data_parallel_id = 0
        ipc_signal_suffix = 123
        init_eplb_signals(self.fd_config, ipc_signal_suffix)
        # Verify IPCSignal was called for rank 0 specific signals.
        # Each tuple: (name, array/size, dtype, suffix, create).
        expected_calls = [
            # Rank 0 specific signals
            ("rearrange_experts_status", np.zeros([1], dtype=np.int32), np.int32, ipc_signal_suffix, True),
            ("rearrange_experts_ips_size", np.zeros([1], dtype=np.int32), np.int32, ipc_signal_suffix, True),
            ("rearrange_experts_ips_list", 1024, None, ipc_signal_suffix, True),  # shm_size
            ("signal_update_weight_from_tensor", np.zeros([1], dtype=np.int32), np.int32, ipc_signal_suffix, True),
            # Common signals
            ("all_experts_token_stats", np.zeros((3, 64), dtype=np.int32), np.int32, ipc_signal_suffix, True),
            ("local_experts_token_stats", np.zeros((3, 64), dtype=np.int32), np.int32, ipc_signal_suffix, True),
            ("signal_update_weight_from_disk", np.zeros([1], dtype=np.int32), np.int32, ipc_signal_suffix, True),
            ("signal_clear_experts_token_stats", np.zeros([1], dtype=np.int32), np.int32, ipc_signal_suffix, True),
            ("result_update_weight_from_disk", np.zeros([1], dtype=np.int32), np.int32, ipc_signal_suffix, True),
        ]
        # Verify all signals were created (count only; parameters are
        # checked in detail in the non-zero-rank test below).
        self.assertEqual(mock_ipc_signal.call_count, len(expected_calls))

    @patch("fastdeploy.eplb.utils.IPCSignal")
    def test_init_eplb_signals_rank_non_zero(self, mock_ipc_signal):
        """Test init_eplb_signals for non-zero rank"""
        mock_ipc_instance = MagicMock()
        mock_ipc_signal.return_value = mock_ipc_instance
        # Test with non-zero rank
        self.fd_config.parallel_config.tensor_parallel_rank = 0
        self.fd_config.parallel_config.tensor_parallel_size = 1
        self.fd_config.parallel_config.local_data_parallel_id = 1
        self.fd_config.eplb_config.redundant_expert_ip_shm_size = 1024
        ipc_signal_suffix = 123
        init_eplb_signals(self.fd_config, ipc_signal_suffix)
        # For non-zero rank, only common signals should be created.
        # DP-wide signals carry the "_dpN" suffix; per-TP-rank ones add "_tpN".
        dp_ipc_signal_suffix = f"{ipc_signal_suffix}_dp1"
        tp_ipc_signal_suffix = f"{dp_ipc_signal_suffix}_tp0"
        expected_calls = [
            # Common signals (no rank 0 specific signals)
            ("rearrange_experts_status", np.zeros([1], dtype=np.int32), np.int32, dp_ipc_signal_suffix, True),
            ("rearrange_experts_ips_size", np.zeros([1], dtype=np.int32), np.int32, dp_ipc_signal_suffix, True),
            ("rearrange_experts_ips_list", 1024, dp_ipc_signal_suffix, True),
            ("signal_update_weight_from_tensor", np.zeros([1], dtype=np.int32), np.int32, dp_ipc_signal_suffix, True),
            ("all_experts_token_stats", np.zeros((3, 64), dtype=np.int32), np.int32, tp_ipc_signal_suffix, True),
            ("local_experts_token_stats", np.zeros((3, 64), dtype=np.int32), np.int32, tp_ipc_signal_suffix, True),
            ("signal_update_weight_from_disk", np.zeros([1], dtype=np.int32), np.int32, tp_ipc_signal_suffix, True),
            ("signal_clear_experts_token_stats", np.zeros([1], dtype=np.int32), np.int32, tp_ipc_signal_suffix, True),
            ("result_update_weight_from_disk", np.zeros([1], dtype=np.int32), np.int32, tp_ipc_signal_suffix, True),
        ]
        # Verify only common signals were created
        self.assertEqual(mock_ipc_signal.call_count, len(expected_calls))
        # Get all actual calls and verify each parameter
        actual_calls = mock_ipc_signal.call_args_list
        # Verify each call matches expected parameters. The defensive
        # branching below tolerates positional vs keyword argument styles.
        for i, expected in enumerate(expected_calls):
            call = actual_calls[i]
            # Extract call arguments
            if len(call) == 2:  # args and kwargs
                args, kwargs = call
                actual_args = args if isinstance(args, tuple) else (args,)
                suffix = kwargs.get("suffix")
            else:
                actual_args = call if isinstance(call, tuple) else (call,)
                suffix = None
            # Skip verification if we can't access the expected parameters
            if len(expected) < 1:
                continue
            # Verify signal name is present
            if len(actual_args) > 0:
                self.assertEqual(actual_args[0], expected[0], f"Signal name mismatch at call {i}")
            else:
                continue
            # Special handling for rearrange_experts_ips_list
            if expected[0] == "rearrange_experts_ips_list":
                continue
            # Verify array/values if present
            if len(expected) > 1 and len(actual_args) > 1:
                if isinstance(expected[1], np.ndarray):
                    np.testing.assert_array_equal(actual_args[1], expected[1], f"Array mismatch at call {i}")
                else:
                    self.assertEqual(actual_args[1], expected[1], f"Value mismatch at call {i}")
            # Verify data type if present
            if len(expected) > 2 and len(actual_args) > 2:
                self.assertEqual(actual_args[2], expected[2], f"Data type mismatch at call {i}")
            # Verify suffix if present
            if len(expected) > 3:
                if suffix is not None:
                    self.assertEqual(suffix, expected[3], f"IPC suffix mismatch at call {i}")
                elif len(actual_args) > 3:
                    self.assertEqual(actual_args[3], expected[3], f"IPC suffix mismatch at call {i}")
            # Verify create flag if present
            if len(expected) > 4 and len(actual_args) > 4:
                self.assertEqual(actual_args[4], expected[4], f"Create flag mismatch at call {i}")

    @patch("fastdeploy.eplb.utils.IPCSignal")
    def test_init_eplb_signals_different_suffix(self, mock_ipc_signal):
        """Test init_eplb_signals with different suffix"""
        mock_ipc_instance = MagicMock()
        mock_ipc_signal.return_value = mock_ipc_instance
        ipc_signal_suffix = "999"
        init_eplb_signals(self.fd_config, ipc_signal_suffix)
        # First four signals are DP-scoped, the rest TP-scoped.
        target_suffix = [
            "999_dp0",
            "999_dp0",
            "999_dp0",
            "999_dp0",
            "999_dp0_tp0",
            "999_dp0_tp0",
            "999_dp0_tp0",
            "999_dp0_tp0",
            "999_dp0_tp0",
        ]
        # Verify that suffix is used correctly
        for idx, call in enumerate(mock_ipc_signal.call_args_list):
            args, kwargs = call
            self.assertEqual(kwargs.get("suffix"), target_suffix[idx])

    def test_main_function(self):
        """Test the main function at the end of the file"""
        # NOTE(review): this test is effectively a no-op — the __name__ guard
        # below can never be true for an imported module, so nothing is
        # asserted; consider removing or testing the main block directly.
        # This tests the if __name__ == "__main__" block
        with patch("fastdeploy.eplb.utils.RedundantExpertWorkload") as mock_workload:
            mock_instance = MagicMock()
            mock_instance.load.return_value = ({"test": "data"}, "success")
            mock_workload.return_value = mock_instance
            # Import and execute the main block
            import fastdeploy.eplb.utils as utils_module

            # The main block should execute without errors
            # We can't easily test the print output, but we can verify the function call
            if hasattr(utils_module, "__name__") and utils_module.__name__ == "__main__":
                # This would execute the main block
                pass
if __name__ == "__main__":
unittest.main()

View File

@@ -0,0 +1,345 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
import unittest
from dataclasses import asdict
from types import SimpleNamespace
from unittest.mock import MagicMock, patch
import numpy as np
from fastdeploy.config import (
CacheConfig,
EPLBConfig,
FDConfig,
ParallelConfig,
SchedulerConfig,
)
from fastdeploy.engine.args_utils import EngineArgs
from fastdeploy.eplb.experts_manager import RedundantExpertManager
class TestRedundantExpertManager(unittest.TestCase):
    """Unit tests for ``fastdeploy.eplb.experts_manager.RedundantExpertManager``.

    Every test patches ``get_logger``, ``Process`` and ``threading.Thread`` so
    that constructing the manager never spawns a real worker process or
    monitor thread.
    """

    def setUp(self):
        """Build a minimal FDConfig describing a 3-layer, 64-expert mock model."""
        max_num_seqs = 2
        engine_args = EngineArgs(
            max_num_seqs=max_num_seqs,
            num_gpu_blocks_override=102,
            max_num_batched_tokens=3200,
        )
        args = asdict(engine_args)
        cache_cfg = CacheConfig(args)
        model_cfg = SimpleNamespace(enable_mm=True)  # Enable multimodal for feature testing
        speculative_cfg = SimpleNamespace(method=None)
        model_cfg.print = print
        model_cfg.max_model_len = 5120
        model_cfg.num_hidden_layers = 3
        model_cfg.moe_num_experts = 64
        model_cfg.moe_layer_start_index = 1
        model_cfg.model = "/test/model"
        cache_cfg.bytes_per_layer_per_block = 1
        parallel_cfg = ParallelConfig(args)
        scheduler_cfg = SchedulerConfig(args)
        graph_opt_cfg = engine_args.create_graph_optimization_config()
        # No redundant experts by default; individual tests override
        # ``redundant_experts_num`` on the config when they need some.
        eplb_args = {
            "redundant_experts_num": 0,
            "redundant_expert_api_user": "test_user",
            "redundant_expert_api_password": "test_pass",
            "redundant_expert_eplb_strategy": "",
            "redundant_expert_ip_shm_size": 1024,
            "moe_quant_type": "",
            "redundant_expert_enable_schedule_cordon": False,
        }
        eplb_config = EPLBConfig(eplb_args)
        self.fd_config = FDConfig(
            model_config=model_cfg,
            cache_config=cache_cfg,
            parallel_config=parallel_cfg,
            graph_opt_config=graph_opt_cfg,
            speculative_config=speculative_cfg,
            scheduler_config=scheduler_cfg,
            eplb_config=eplb_config,
        )
        self.fd_config.parallel_config.local_data_parallel_id = 0
        self.fd_config.splitwise_role = "decode"

    def _make_manager(self, rank=0, ep_size=32):
        """Construct a RedundantExpertManager against the shared test config.

        Callers must already have ``Process``/``threading.Thread`` (and usually
        ``get_logger``) patched so that construction stays side-effect free.
        """
        return RedundantExpertManager(
            rank=rank, ep_size=ep_size, fd_config=self.fd_config, ipc_signal_suffix=0
        )

    @patch("fastdeploy.eplb.experts_manager.get_logger")
    @patch("fastdeploy.eplb.experts_manager.Process")
    @patch("fastdeploy.eplb.experts_manager.threading.Thread")
    def test_init(self, mock_thread, mock_process, mock_get_logger):
        """Construction captures its arguments, sizes the mapping arrays and starts workers."""
        mock_get_logger.return_value = MagicMock()
        mock_process.return_value = MagicMock()
        mock_thread.return_value = MagicMock()

        manager = self._make_manager()

        # Constructor arguments are stored verbatim.
        self.assertEqual(manager.rank, 0)
        self.assertEqual(manager.ep_size, 32)
        self.assertEqual(manager.fd_config, self.fd_config)
        self.assertEqual(manager.num_logical_experts, 64)
        self.assertEqual(manager.num_replicas, 64)  # 64 logical + 0 redundant
        # Mapping arrays are sized (num_hidden_layers, ...).
        self.assertEqual(manager.model_ep_rank_to_expert_id_list.shape, (3, 64))
        self.assertEqual(manager.model_expert_id_to_ep_rank_array.shape, (3, 64, 1))
        self.assertEqual(manager.model_expert_in_rank_num_list.shape, (3, 64))
        # The background worker process and the monitor thread are both started.
        mock_process.assert_called_once()
        mock_thread.assert_called_once()

    @patch("fastdeploy.eplb.experts_manager.get_logger")
    @patch("fastdeploy.eplb.experts_manager.Process")
    @patch("fastdeploy.eplb.experts_manager.threading.Thread")
    def test_init_with_redundant_experts(self, mock_thread, mock_process, mock_get_logger):
        """Redundant experts enlarge the replica count and the mapping arrays."""
        self.fd_config.eplb_config.redundant_experts_num = 16
        mock_get_logger.return_value = MagicMock()

        manager = self._make_manager(ep_size=8)

        self.assertEqual(manager.num_replicas, 80)  # 64 logical + 16 redundant
        self.assertEqual(manager.model_ep_rank_to_expert_id_list.shape, (3, 80))
        # Last dim holds up to 16 redundant replicas plus the primary copy.
        self.assertEqual(manager.model_expert_id_to_ep_rank_array.shape, (3, 64, 17))

    @patch("fastdeploy.eplb.experts_manager.get_logger")
    @patch("fastdeploy.eplb.experts_manager.Process")
    @patch("fastdeploy.eplb.experts_manager.threading.Thread")
    def test_get_ep_rank_to_expert_id_list(self, mock_thread, mock_process, mock_get_logger):
        """get_ep_rank_to_expert_id_list returns the three mapping arrays unchanged."""
        mock_get_logger.return_value = MagicMock()
        manager = self._make_manager()
        # Overwrite the mapping arrays with small, recognizable fixtures.
        manager.model_ep_rank_to_expert_id_list = np.array([[0, 1, 2, 3]])
        manager.model_expert_id_to_ep_rank_array = np.array([[[0], [1], [2], [3]]])
        manager.model_expert_in_rank_num_list = np.array([[1, 1, 1, 1]])

        result = manager.get_ep_rank_to_expert_id_list()

        self.assertEqual(len(result), 3)
        np.testing.assert_array_equal(result[0], np.array([[0, 1, 2, 3]]))
        np.testing.assert_array_equal(result[1], np.array([[[0], [1], [2], [3]]]))
        np.testing.assert_array_equal(result[2], np.array([[1, 1, 1, 1]]))

    @patch("fastdeploy.eplb.experts_manager.get_logger")
    @patch("fastdeploy.eplb.experts_manager.Process")
    @patch("fastdeploy.eplb.experts_manager.threading.Thread")
    def test_caculate_expert_rank_table(self, mock_thread, mock_process, mock_get_logger):
        """caculate_expert_rank_table installs the output of rebalance_experts.

        NOTE: the test name mirrors the (misspelled) production method
        ``caculate_expert_rank_table`` and must stay in sync with it.
        """
        mock_get_logger.return_value = MagicMock()
        manager = self._make_manager()
        manager.model_tokens_per_expert_stats_list = np.array([[10, 20, 30, 40], [5, 15, 25, 35]])

        with patch("fastdeploy.eplb.experts_manager.rebalance_experts") as mock_rebalance:
            phy2log = np.random.randint(0, 100, size=(3, 64))
            log2phy = np.random.randint(0, 100, size=(3, 64, 1))
            logcnt = np.random.randint(0, 100, size=(3, 64))
            mock_rebalance.return_value = (phy2log, log2phy, logcnt)

            manager.caculate_expert_rank_table(is_init=True)

            mock_rebalance.assert_called_once()
            # The physical->logical table must be replaced wholesale.
            np.testing.assert_array_equal(manager.model_ep_rank_to_expert_id_list, phy2log)

    @patch("fastdeploy.eplb.experts_manager.get_logger")
    @patch("fastdeploy.eplb.experts_manager.Process")
    @patch("fastdeploy.eplb.experts_manager.threading.Thread")
    @patch("fastdeploy.eplb.experts_manager.IPCSignal")
    def test_update_weight_from_disk(self, mock_ipc_signal, mock_thread, mock_process, mock_get_logger):
        """update_weight_from_disk exchanges one request/reply with the loader process."""
        mock_get_logger.return_value = MagicMock()
        manager = self._make_manager()
        mock_ipc_signal.return_value = MagicMock()
        manager.update_weight_from_disk_result = MagicMock()
        # Fake the pipes connecting the manager to the background loader.
        manager.parent_mg_conn = MagicMock()
        manager.parent_data_conn = MagicMock()
        manager.parent_data_conn.recv.return_value = {"result": True, "weights": ["weight1", "weight2"]}
        # Pretend the expert placement changed between two rebalances.
        manager.last_model_ep_rank_to_expert_id_list = np.array([[0, 1, 2, 3]])
        manager.model_ep_rank_to_expert_id_list = np.array([[1, 2, 3, 4]])

        with patch("time.time", return_value=1000):
            manager.update_weight_from_disk()

        # Exactly one request sent to the loader and one reply consumed.
        manager.parent_mg_conn.send.assert_called_once()
        manager.parent_data_conn.recv.assert_called_once()
        # The received weights are cached on the manager.
        self.assertEqual(manager.tensor_infos, ["weight1", "weight2"])

    @patch("fastdeploy.eplb.experts_manager.get_logger")
    @patch("fastdeploy.eplb.experts_manager.Process")
    @patch("fastdeploy.eplb.experts_manager.threading.Thread")
    @patch("fastdeploy.eplb.experts_manager.requests.post")
    def test_allgather_expert_token_stats(self, mock_requests, mock_thread, mock_process, mock_get_logger):
        """allgather succeeds trivially when there are no peer DP ranks."""
        mock_get_logger.return_value = MagicMock()
        manager = self._make_manager()
        manager.dp_rank_address = ["127.0.0.1:8000", "127.0.0.1:8001"]
        # Canned responses sized (3 layers, 64 experts); they are never
        # consumed because the address list is cleared again below.
        mock_response1 = MagicMock()
        mock_response1.status_code = 200
        mock_response1.json.return_value = {"data": np.random.randint(0, 100, size=(3, 64))}
        mock_response2 = MagicMock()
        mock_response2.status_code = 200
        mock_response2.json.return_value = {"data": np.random.randint(0, 100, size=(3, 64))}
        mock_requests.side_effect = [mock_response1, mock_response2]
        manager.num_hidden_layers = 3
        manager.num_logical_experts = 64
        # With no peers registered, no HTTP traffic happens and the call succeeds.
        manager.dp_rank_address = []

        result = manager.allgather_expert_token_stats()

        self.assertTrue(result)
        # Nothing was gathered, so the accumulated stats stay all-zero.
        expected_stats = np.zeros((3, 64))
        np.testing.assert_array_equal(manager.model_tokens_per_expert_stats_list, expected_stats)

    @patch("fastdeploy.eplb.experts_manager.get_logger")
    @patch("fastdeploy.eplb.experts_manager.Process")
    @patch("fastdeploy.eplb.experts_manager.threading.Thread")
    @patch("fastdeploy.eplb.experts_manager.requests.post")
    def test_broadcast_expert_token_stats(self, mock_requests, mock_thread, mock_process, mock_get_logger):
        """broadcast posts the stats once to every peer DP rank."""
        mock_get_logger.return_value = MagicMock()
        manager = self._make_manager()
        manager.dp_rank_address = ["127.0.0.1:8000", "127.0.0.1:8001"]
        mock_response1 = MagicMock()
        mock_response1.status_code = 200
        mock_response2 = MagicMock()
        mock_response2.status_code = 200
        mock_requests.side_effect = [mock_response1, mock_response2]

        result = manager.broadcast_expert_token_stats()

        self.assertTrue(result)
        # One POST per peer address.
        self.assertEqual(mock_requests.call_count, 2)

    @patch("fastdeploy.eplb.experts_manager.get_logger")
    @patch("fastdeploy.eplb.experts_manager.Process")
    @patch("fastdeploy.eplb.experts_manager.threading.Thread")
    @patch("fastdeploy.eplb.experts_manager.requests.post")
    def test_allgather_load_weight_result(self, mock_requests, mock_thread, mock_process, mock_get_logger):
        """allgather_load_weight_result reports mixed success/failure correctly."""
        mock_get_logger.return_value = MagicMock()
        manager = self._make_manager()
        manager.dp_rank_address = ["127.0.0.1:8000", "127.0.0.1:8001"]
        mock_response1 = MagicMock()
        mock_response1.status_code = 200
        mock_response1.json.return_value = {"data": [1, 1]}  # two successful loads
        mock_response2 = MagicMock()
        mock_response2.status_code = 200
        mock_response2.json.return_value = {"data": [-1, 1]}  # one failed, one successful
        mock_requests.side_effect = [mock_response1, mock_response2]

        all_success, exist_fail = manager.allgather_load_weight_result()

        self.assertFalse(all_success)  # at least one rank failed
        self.assertTrue(exist_fail)  # and that failure is surfaced

    def test_edge_cases(self):
        """Both collective helpers succeed trivially with an empty address list."""
        with (
            patch("fastdeploy.eplb.experts_manager.get_logger"),
            patch("fastdeploy.eplb.experts_manager.Process"),
            patch("fastdeploy.eplb.experts_manager.threading.Thread"),
        ):
            manager = self._make_manager()
            manager.dp_rank_address = []
            # allgather with no peers is a no-op that reports success.
            result = manager.allgather_expert_token_stats()
            self.assertTrue(result)
            manager.dp_rank_address = []
            # broadcast with no peers is likewise a successful no-op.
            result = manager.broadcast_expert_token_stats()
            self.assertTrue(result)
if __name__ == "__main__":
unittest.main()