1. Fix a draft-model bug under expert parallelism (EP); 2. Fix a sampler bug (#4589)

Author: RAM
Date: 2025-10-27 17:47:34 +08:00 (committed by GitHub)
Parent: 8aab4e367f
Commit: 25a983ba9c
2 changed files with 12 additions and 2 deletions


@@ -567,6 +567,7 @@ class SpeculativeSampler(nn.Layer):
         max_model_len: int,
         share_inputs: List[paddle.Tensor],
         accept_all_drafts: bool = False,
+        reject_all_drafts: bool = False,
     ) -> paddle.Tensor:
         """ """
@@ -622,7 +623,7 @@ class SpeculativeSampler(nn.Layer):
             max_model_len,
             self.speculative_verify_window,
             True,  # enable_topp
-            self.speculative_benchmark_mode,
+            (self.speculative_benchmark_mode or reject_all_drafts),
             accept_all_drafts,
         )
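
Taken together, the two sampler hunks add a reject_all_drafts switch and OR it into the kernel argument that speculative_benchmark_mode already drives, so either flag forces the verifier to discard every drafted token. A minimal sketch of those flag semantics, assuming standard speculative-sampling acceptance; toy_verify and all names here are illustrative stand-ins, not FastDeploy's fused CUDA verify op:

import numpy as np

def toy_verify(draft_tokens, target_probs, draft_probs,
               accept_all_drafts=False, reject_all_drafts=False, rng=None):
    # Returns how many of the drafted tokens survive verification.
    rng = rng or np.random.default_rng(0)
    if reject_all_drafts:
        return 0                  # worst case: fall back to the target model every step
    if accept_all_drafts:
        return len(draft_tokens)  # best case: the whole draft window is kept
    accepted = 0
    for tok in draft_tokens:      # standard accept/reject test per drafted token
        if rng.random() < min(1.0, target_probs[tok] / max(draft_probs[tok], 1e-9)):
            accepted += 1
        else:
            break
    return accepted

# e.g. two drafted tokens over a 4-token vocabulary
p_target = np.array([0.1, 0.4, 0.4, 0.1])
p_draft = np.array([0.25, 0.25, 0.25, 0.25])
print(toy_verify([1, 2], p_target, p_draft))                          # stochastic
print(toy_verify([1, 2], p_target, p_draft, reject_all_drafts=True))  # always 0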


@@ -1334,6 +1334,8 @@ class GPUModelRunner(ModelRunnerBase):
         # TODO(wanglongzhi): Modifying the config at runtime is not appropriate; it needs to be moved to forward_meta. It will be used in MoEMethodBase.apply()
         if self.fd_config.parallel_config.use_ep and self.fd_config.scheduler_config.splitwise_role == "mixed":
             self.fd_config.model_config.moe_phase.phase = "decode" if if_only_decode else "prefill"
+            if self.speculative_decoding:
+                self.proposer.fd_config.parallel_config.moe_phase.phase = "decode" if if_only_decode else "prefill"
         # Update Batch type for cuda graph for only_prefill_batch
         only_prefill_use_cudagraph = self.use_cudagraph and self.cudagraph_only_prefill and self.only_prefill()
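
This hunk is the draft-model EP fix from the commit title: with expert parallelism in a mixed splitwise role, the MoE phase is flipped between "prefill" and "decode" at runtime, but previously only on the target model's config; the proposer (draft model) kept a stale phase, so its expert dispatch could disagree with the batch type. A minimal sketch of the synchronization, using stand-in dataclasses rather than FastDeploy's real fd_config:

from dataclasses import dataclass, field

@dataclass
class MoEPhase:
    phase: str = "prefill"

@dataclass
class ModelCfg:
    moe_phase: MoEPhase = field(default_factory=MoEPhase)

def sync_moe_phase(target_cfg: ModelCfg, proposer_cfg: ModelCfg, if_only_decode: bool) -> None:
    phase = "decode" if if_only_decode else "prefill"
    target_cfg.moe_phase.phase = phase    # what the code already did
    proposer_cfg.moe_phase.phase = phase  # the fix: keep the draft model in lockstep

target, proposer = ModelCfg(), ModelCfg()
sync_moe_phase(target, proposer, if_only_decode=True)
assert target.moe_phase.phase == proposer.moe_phase.phase == "decode"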
@@ -1572,6 +1574,8 @@ class GPUModelRunner(ModelRunnerBase):
         self,
         hidden_states: paddle.Tensor,
         model_output: paddle.Tensor,
+        accept_all_drafts=False,
+        reject_all_drafts=False,
     ) -> paddle.Tensor:
         logits = self.model.compute_logits(hidden_states)
@@ -1598,6 +1602,8 @@ class GPUModelRunner(ModelRunnerBase):
                 self.sampling_metadata,
                 self.model_config.max_model_len,
                 self.share_inputs,
+                accept_all_drafts,
+                reject_all_drafts,
             )
         sampler_output = None
         if self.parallel_config.tensor_parallel_size > 1:
@@ -1679,6 +1685,7 @@ class GPUModelRunner(ModelRunnerBase):
         in_capturing: bool = False,
         capture_prefill: bool = False,
         accept_all_drafts: bool = False,
+        reject_all_drafts: bool = False,
     ) -> paddle.Tensor:
         """
         Use dummy inputs to run before formal execution.
@@ -1688,6 +1695,7 @@ class GPUModelRunner(ModelRunnerBase):
             in_capturing: Is cuda graph in capturing state
             capture_prefill: Capture pure prefill for cuda graph
             accept_all_drafts: Target model will accept all draft tokens
+            reject_all_drafts: Target model will reject all draft tokens
         """
         input_length_list, max_dec_len_list, block_num = self.get_input_length_list(
@@ -1747,7 +1755,7 @@ class GPUModelRunner(ModelRunnerBase):
                 self._dummy_pooler_run(hidden_states)
                 break
             else:
-                self._dummy_sampler_run(hidden_states, model_output)
+                self._dummy_sampler_run(hidden_states, model_output, accept_all_drafts, reject_all_drafts)
         # 7. Update 'infer_seed' and step_cuda()
         self.share_inputs["infer_seed"].add_(self.infer_seed_increment)
@@ -1891,6 +1899,7 @@ class GPUModelRunner(ModelRunnerBase):
             in_capturing=True,
             expected_decode_len=3,
             accept_all_drafts=False,
+            reject_all_drafts=True,
         )
         logger.info(f"Warm up the Draft model with the num_tokens:{batch_size}, expected_decode_len:{3}")
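
Passing reject_all_drafts=True during draft-model warmup exercises the slowest decode shape: every verify step keeps zero drafted tokens, so the target model launches once per emitted token, which is the case a CUDA graph capture most needs to cover. A back-of-the-envelope helper (hypothetical, not part of the patch) makes the worst and best cases concrete, assuming each verify step emits the accepted drafts plus one bonus token:

import math

def target_launches(expected_decode_len: int, accepted_per_step: int) -> int:
    # Tokens emitted per target-model step: accepted drafts plus one bonus token.
    tokens_per_step = accepted_per_step + 1
    return math.ceil(expected_decode_len / tokens_per_step)

assert target_launches(3, accepted_per_step=0) == 3  # reject_all_drafts=True: worst case
assert target_launches(3, accepted_per_step=2) == 1  # accept_all_drafts with 2 drafts/step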