[Feature] support eplb in api_server (#4782)

* support eplb in api_server * update code * add eplb test case * update eplb * support tp+dp eplb * update test cese * update code * update code * fix bug * update copilot review * update test case name
2025-12-24 13:28:13 +08:00 · 2025-11-24 20:22:29 +08:00
parent d5bd64336a
commit 8e4e3ff510
25 changed files with 2102 additions and 421 deletions
--- a/fastdeploy/eplb/utils.py
+++ b/fastdeploy/eplb/utils.py
@@ -1,9 +1,27 @@
-"""eplb utilities"""
+"""
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""

 import json
 import os
 import time
-from enum import Enum
+
+import numpy as np
+
+from fastdeploy.config import FDConfig
+from fastdeploy.inter_communicator import IPCSignal


 class RedundantExpertWorkload:
@@ -47,13 +65,101 @@ class RedundantExpertWorkload:
            return {}, f"redundant_expert: load file {self.meta_file_name} failed, {e}"


-class RearrangeExpertState(Enum):
-    """RearrangeExpertState"""
+def init_eplb_signals(config: FDConfig, ipc_signal_suffix):
+    """
+    Initialize shared memory to indicate eplb status
+    """
+    if config.parallel_config.tensor_parallel_rank != 0:
+        # only TP rank 0 need to init eplb signals, rank 0 manage all EPLB signals for all TP ranks
+        return

-    free = 0
-    doing = 1
-    load_succ = 2  # load weight from disk success
-    done = 3
+    dp_ipc_signal_suffix = f"{ipc_signal_suffix}_dp{config.parallel_config.local_data_parallel_id}"
+    # rearrange_experts_status Record the expert's rearrangement status
+    rearrange_experts_array = np.zeros([1], dtype=np.int32)
+    _ = IPCSignal(
+        name="rearrange_experts_status",
+        array=rearrange_experts_array,
+        dtype=np.int32,
+        suffix=dp_ipc_signal_suffix,
+        create=True,
+    )
+
+    # Record all DP rank IPs when receiving expert rearrangement requests
+    rearrange_experts_ips_size_array = np.zeros([1], dtype=np.int32)
+    _ = IPCSignal(
+        name="rearrange_experts_ips_size",
+        array=rearrange_experts_ips_size_array,
+        dtype=np.int32,
+        suffix=dp_ipc_signal_suffix,
+        create=True,
+    )
+    _ = IPCSignal(
+        name="rearrange_experts_ips_list",
+        shm_size=config.eplb_config.redundant_expert_ip_shm_size,
+        suffix=dp_ipc_signal_suffix,
+        create=True,
+    )
+
+    # Receive signals for updating weights
+    signal_update_weight_from_tensor = np.zeros([1], dtype=np.int32)
+    _ = IPCSignal(
+        name="signal_update_weight_from_tensor",
+        array=signal_update_weight_from_tensor,
+        dtype=np.int32,
+        suffix=dp_ipc_signal_suffix,
+        create=True,
+    )
+
+    for rank_id in range(config.parallel_config.tensor_parallel_size):
+        tp_ipc_signal_suffix = f"{dp_ipc_signal_suffix}_tp{rank_id}"
+        # Record expert workload
+        experts_token_stats = np.zeros(
+            (config.model_config.num_hidden_layers, config.model_config.moe_num_experts),
+            dtype=np.int32,
+        )
+        _ = IPCSignal(
+            name="all_experts_token_stats",
+            array=experts_token_stats,
+            dtype=np.int32,
+            suffix=tp_ipc_signal_suffix,
+            create=True,
+        )
+        _ = IPCSignal(
+            name="local_experts_token_stats",
+            array=experts_token_stats,
+            dtype=np.int32,
+            suffix=tp_ipc_signal_suffix,
+            create=True,
+        )
+
+        # Receive signals for loading weights
+        signal_update_weight_from_disk = np.zeros([1], dtype=np.int32)
+        _ = IPCSignal(
+            name="signal_update_weight_from_disk",
+            array=signal_update_weight_from_disk,
+            dtype=np.int32,
+            suffix=tp_ipc_signal_suffix,
+            create=True,
+        )
+
+        # Receive signals for clearing expert loads
+        clear_experts_token_stats = np.zeros([1], dtype=np.int32)
+        _ = IPCSignal(
+            name="signal_clear_experts_token_stats",
+            array=clear_experts_token_stats,
+            dtype=np.int32,
+            suffix=tp_ipc_signal_suffix,
+            create=True,
+        )
+
+        result_update_weight_from_disk = np.zeros([1], dtype=np.int32)
+        _ = IPCSignal(
+            name="result_update_weight_from_disk",
+            array=result_update_weight_from_disk,
+            dtype=np.int32,
+            suffix=tp_ipc_signal_suffix,
+            create=True,
+        )


 if __name__ == "__main__":