[TSP] Support qwen3 moe tsp + cudagraph (#4871)
Some checks failed
CE Compile Job / ce_job_pre_check (push) Has been cancelled
CE Compile Job / print_ce_job_pre_check_outputs (push) Has been cancelled
CE Compile Job / FD-Clone-Linux (push) Has been cancelled
CE Compile Job / Show Code Archive Output (push) Has been cancelled
CE Compile Job / BUILD_SM8090 (push) Has been cancelled
CE Compile Job / BUILD_SM8689 (push) Has been cancelled
CE Compile Job / CE_UPLOAD (push) Has been cancelled
Deploy GitHub Pages / deploy (push) Has been cancelled

* support qwen3_moe tsp mode

* fix

* fix

* update

* update

* update

* fix

* support external_rmsnorm

* update

* fix
This commit is contained in:
Yuanle Liu
2025-11-10 23:37:51 +08:00
committed by GitHub
parent fb2eb403ab
commit 3dc0ffa46d
28 changed files with 173 additions and 273 deletions

View File

@@ -307,8 +307,8 @@ class ModelConfig:
Read configuration information from environment variables and update the object's attributes.
If an attribute is not present or is an empty string in the environment variables, use the default value.
"""
self.max_stop_seqs_num = int(envs.FD_MAX_STOP_SEQS_NUM)
self.stop_seqs_max_len = int(envs.FD_STOP_SEQS_MAX_LEN)
self.max_stop_seqs_num = envs.FD_MAX_STOP_SEQS_NUM
self.stop_seqs_max_len = envs.FD_STOP_SEQS_MAX_LEN
def reset_config_value(key, value):
if not hasattr(self, key.lower()):
@@ -548,6 +548,8 @@ class ParallelConfig:
self.do_profile: bool = False
# Whether to use internode_ll_two_stage
self.use_internode_ll_two_stage: bool = False
# Whether to disable sequence-parallel MoE
self.disable_sequence_parallel_moe: bool = False
self.pod_ip: str = None
# Enable the custom all-reduce kernel, falling back to NCCL (dist.all_reduce).
@@ -577,14 +579,14 @@ class ParallelConfig:
else:
self.pd_disaggregation_mode = "None"
# ep+tp strategy: "all_reduce" or "all_to_all"
# all_reduce: qkv_linear + attn + out_linear + allreduce
# all_to_all: allgather + qkv_linear + attn + all2all + out_linear
self.ep_tp_strategy = envs.FD_EP_TP_STRATEGY
assert self.ep_tp_strategy in [
"all_reduce",
"all_to_all",
], f"FD_EP_TP_STRATEGY: '{self.ep_tp_strategy}' is not supported, only supports 'all_reduce' or 'all_to_all'."
# disable_sequence_parallel_moe: qkv_linear + attn + out_linear + allreduce
# use_sequence_parallel_moe: allgather + qkv_linear + attn + all2all + out_linear
self.use_sequence_parallel_moe = (
(not self.disable_sequence_parallel_moe)
and self.expert_parallel_size > 1
and self.tensor_parallel_size > 1
)
logger.info(f"use_sequence_parallel_moe: {self.use_sequence_parallel_moe}")
def set_communicate_group(self):
# different tp group id