fix eplb weight updating (#5529) (#5661)

Co-authored-by: Jiang-Jia-Jun <163579578+Jiang-Jia-Jun@users.noreply.github.com>
This commit is contained in:
RichardWooSJTU
2025-12-23 16:09:05 +08:00
committed by GitHub
parent cfddec7142
commit 1b74540820

View File

@@ -443,6 +443,9 @@ class RedundantExpertManager:
self.logger.info("redundant_expert: allreduce_load_weight_result waiting")
return False
# self.broadcast_load_weight_success()
self.logger.info(
f"redundant_expert_enable_schedule_cordon {self.eplb_config.redundant_expert_enable_schedule_cordon}"
)
if not exist_fail and all_success:
# prefill must wait for the scheduling cordon to take effect
if (
@@ -460,7 +463,7 @@ class RedundantExpertManager:
"""
all_success, exist_fail = False, False
success_count, fail_count = 0, 0
success_count, fail_count, unfinish_count = 0, 0, 0
for addr in self.dp_rank_address:
try:
params = {
@@ -492,6 +495,11 @@ class RedundantExpertManager:
f"redundant_expert: allgather_load_weight_result fail. addr {addr}, result {result}"
)
exist_fail = True
elif result == 0:
unfinish_count += 1
self.logger.debug(
f"redundant_expert: allgather_load_weight_result unfinish. addr {addr}, result {result}"
)
except Exception as e:
self.logger.error(f"redundant_expert: allgather_load_weight_result error. addr {addr}, error {e}")
@@ -500,6 +508,11 @@ class RedundantExpertManager:
"redundant_expert: allgather_load_weight_result not all ready, "
+ f"succ {success_count} fail {fail_count} total {len(self.dp_rank_address)}"
)
elif unfinish_count > 0:
self.logger.info(
"redundant_expert: allgather_load_weight_result not all ready, "
+ f"succ {success_count} fail {fail_count} unfinish_count {unfinish_count} total {len(self.dp_rank_address)}"
)
else:
self.logger.info("redundant_expert: allgather_load_weight_result all success")
all_success = True