[TSP] Support qwen3 moe tsp + cudagraph (#4871)

* support qwen3_moe tsp mode * fix * fix * update * update * update * fix * support external_rmsnorm * update * fix
2025-12-24 13:28:13 +08:00 · 2025-11-10 23:37:51 +08:00
parent fb2eb403ab
commit 3dc0ffa46d
28 changed files with 173 additions and 273 deletions
--- a/fastdeploy/config.py
+++ b/fastdeploy/config.py
@@ -307,8 +307,8 @@ class ModelConfig:
        Read configuration information from environment variables and update the object's attributes.
        If an attribute is not present or is an empty string in the environment variables, use the default value.
        """
-        self.max_stop_seqs_num = int(envs.FD_MAX_STOP_SEQS_NUM)
-        self.stop_seqs_max_len = int(envs.FD_STOP_SEQS_MAX_LEN)
+        self.max_stop_seqs_num = envs.FD_MAX_STOP_SEQS_NUM
+        self.stop_seqs_max_len = envs.FD_STOP_SEQS_MAX_LEN

        def reset_config_value(key, value):
            if not hasattr(self, key.lower()):
@@ -548,6 +548,8 @@ class ParallelConfig:
        self.do_profile: bool = False
        # Use internode_ll_two_stage or not
        self.use_internode_ll_two_stage: bool = False
+        # Set to True to turn off sequence-parallel MoE (otherwise used when both EP and TP sizes are > 1)
+        self.disable_sequence_parallel_moe: bool = False

        self.pod_ip: str = None
        # enable the custom all-reduce kernel and fall back to NCCL(dist.all_reduce).
@@ -577,14 +579,14 @@ class ParallelConfig:
        else:
            self.pd_disaggregation_mode = "None"

-        # ep+tp strategy: "all_reduce" or "all_to_all"
-        # all_reduce: qkv_linear + attn + out_linear + allreduce
-        # all_to_all: allgather + qkv_linear + attn + all2all + out_linear
-        self.ep_tp_strategy = envs.FD_EP_TP_STRATEGY
-        assert self.ep_tp_strategy in [
-            "all_reduce",
-            "all_to_all",
-        ], f"FD_EP_TP_STRATEGY: '{self.ep_tp_strategy}' is not supported, only supports 'all_reduce' or 'all_to_all'."
+        # sequence-parallel MoE disabled: qkv_linear + attn + out_linear + allreduce
+        # sequence-parallel MoE enabled:  allgather + qkv_linear + attn + all2all + out_linear
+        self.use_sequence_parallel_moe = (
+            (not self.disable_sequence_parallel_moe)
+            and self.expert_parallel_size > 1
+            and self.tensor_parallel_size > 1
+        )
+        logger.info(f"use_sequence_parallel_moe: {self.use_sequence_parallel_moe}")

    def set_communicate_group(self):
        # different tp group id