mirror of
https://github.com/PaddlePaddle/FastDeploy.git
synced 2025-12-24 13:28:13 +08:00
[TSP] Support qwen3 moe tsp + cudagraph (#4871)
Some checks failed
CE Compile Job / ce_job_pre_check (push) Has been cancelled
CE Compile Job / print_ce_job_pre_check_outputs (push) Has been cancelled
CE Compile Job / FD-Clone-Linux (push) Has been cancelled
CE Compile Job / Show Code Archive Output (push) Has been cancelled
CE Compile Job / BUILD_SM8090 (push) Has been cancelled
CE Compile Job / BUILD_SM8689 (push) Has been cancelled
CE Compile Job / CE_UPLOAD (push) Has been cancelled
Deploy GitHub Pages / deploy (push) Has been cancelled
Some checks failed
CE Compile Job / ce_job_pre_check (push) Has been cancelled
CE Compile Job / print_ce_job_pre_check_outputs (push) Has been cancelled
CE Compile Job / FD-Clone-Linux (push) Has been cancelled
CE Compile Job / Show Code Archive Output (push) Has been cancelled
CE Compile Job / BUILD_SM8090 (push) Has been cancelled
CE Compile Job / BUILD_SM8689 (push) Has been cancelled
CE Compile Job / CE_UPLOAD (push) Has been cancelled
Deploy GitHub Pages / deploy (push) Has been cancelled
* support qwen3_moe tsp mode
* fix
* fix
* update
* update
* update
* fix
* support external_rmsnorm
* update
* fix
This commit is contained in:
@@ -307,8 +307,8 @@ class ModelConfig:
|
||||
Read configuration information from environment variables and update the object's attributes.
|
||||
If an attribute is not present or is an empty string in the environment variables, use the default value.
|
||||
"""
|
||||
self.max_stop_seqs_num = int(envs.FD_MAX_STOP_SEQS_NUM)
|
||||
self.stop_seqs_max_len = int(envs.FD_STOP_SEQS_MAX_LEN)
|
||||
self.max_stop_seqs_num = envs.FD_MAX_STOP_SEQS_NUM
|
||||
self.stop_seqs_max_len = envs.FD_STOP_SEQS_MAX_LEN
|
||||
|
||||
def reset_config_value(key, value):
|
||||
if not hasattr(self, key.lower()):
|
||||
@@ -548,6 +548,8 @@ class ParallelConfig:
|
||||
self.do_profile: bool = False
|
||||
# Use internode_ll_two_stage or not
|
||||
self.use_internode_ll_two_stage: bool = False
|
||||
# disable sequence parallel moe
|
||||
self.disable_sequence_parallel_moe: bool = False
|
||||
|
||||
self.pod_ip: str = None
|
||||
# enable the custom all-reduce kernel and fall back to NCCL(dist.all_reduce).
|
||||
@@ -577,14 +579,14 @@ class ParallelConfig:
|
||||
else:
|
||||
self.pd_disaggregation_mode = "None"
|
||||
|
||||
# ep+tp strategy: "all_reduce" or "all_to_all"
|
||||
# all_reduce: qkv_linear + attn + out_linear + allreduce
|
||||
# all_to_all: allgather + qkv_linear + attn + all2all + out_linear
|
||||
self.ep_tp_strategy = envs.FD_EP_TP_STRATEGY
|
||||
assert self.ep_tp_strategy in [
|
||||
"all_reduce",
|
||||
"all_to_all",
|
||||
], f"FD_EP_TP_STRATEGY: '{self.ep_tp_strategy}' is not supported, only supports 'all_reduce' or 'all_to_all'."
|
||||
# disable_sequence_parallel_moe: qkv_linear + attn + out_linear + allreduce
|
||||
# use_sequence_parallel_moe: allgather + qkv_linear + attn + all2all + out_linear
|
||||
self.use_sequence_parallel_moe = (
|
||||
(not self.disable_sequence_parallel_moe)
|
||||
and self.expert_parallel_size > 1
|
||||
and self.tensor_parallel_size > 1
|
||||
)
|
||||
logger.info(f"use_sequence_parallel_moe: {self.use_sequence_parallel_moe}")
|
||||
|
||||
def set_communicate_group(self):
|
||||
# different tp group id
|
||||
|
||||
Reference in New Issue
Block a user