[Executor] Adjust signal sending order in RL training (#3773) (#4066)

* Adjust processing order

* fix bug

* fix update_parameters bug

* refine code
This commit is contained in:
RAM
2025-09-11 15:41:32 +08:00
committed by GitHub
parent 48f2ab3fb3
commit 63d24b2210
3 changed files with 20 additions and 22 deletions

View File

@@ -44,6 +44,7 @@ class DynamicWeightManager:
self.model: nn.Layer = model
self._capture_model_state()
self.update_parameters()
+self.finalize_update()
logger.info(
f"✅ DynamicLoad model built successfully by {self.load_config.load_strategy}, "
@@ -79,8 +80,6 @@ class DynamicWeightManager:
logger.info(f"Update parameters in {time.perf_counter()-start_time:.2f}s")
-self._finalize_update(pid)
def _update_ipc_snapshot(self):
"""Update using IPC snapshot strategy for elastic recovery."""
model_path = os.path.join(
@@ -144,7 +143,7 @@ class DynamicWeightManager:
if src.shape != dst.shape:
raise ValueError(f"Shape mismatch for {name}: {src.shape} vs {dst.shape}")
-def _finalize_update(self, pid: int):
+def finalize_update(self, pid: int = 0):
"""Finalize update process with verification."""
self._verify_parameters("update")
if self.parallel_config.tensor_parallel_size > 1: