[Feature] support chunked moe (#4575)

* [Feature] support chunked moe

* update

* update

* fix and add test

* update

* fix conflict and modify test

* fix fused_moe

* fix fused_moe

* fix docstring

* fix

* fix typo

* fix test

* fix

* fix

* fix test

* fix test
This commit is contained in:
Longzhi Wang
2025-12-01 15:17:18 +08:00
committed by GitHub
parent 6f42c37359
commit add524d80c
10 changed files with 405 additions and 5 deletions

View File

@@ -286,6 +286,16 @@ class EngineArgs:
Enable expert parallelism.
"""
enable_chunked_moe: bool = False
"""
Whether to use chunked moe.
"""
chunked_moe_size: int = 256
"""
Chunk size of moe input.
"""
cache_transfer_protocol: str = "ipc"
"""
Protocol to use for cache transfer.
@@ -870,6 +880,18 @@ class EngineArgs:
default=EngineArgs.eplb_config,
help="Config of eplb.",
)
parallel_group.add_argument(
"--enable-chunked-moe",
action="store_true",
default=EngineArgs.enable_chunked_moe,
help="Use chunked moe.",
)
parallel_group.add_argument(
"--chunked-moe-size",
type=int,
default=EngineArgs.chunked_moe_size,
help="Chunked size of moe input.",
)
# Load group
load_group = parser.add_argument_group("Load Configuration")