diff --git a/fastdeploy/model_executor/layers/sample/sampler.py b/fastdeploy/model_executor/layers/sample/sampler.py
index c8e1f92e9..2fb823db5 100644
--- a/fastdeploy/model_executor/layers/sample/sampler.py
+++ b/fastdeploy/model_executor/layers/sample/sampler.py
@@ -567,6 +567,7 @@ class SpeculativeSampler(nn.Layer):
         max_model_len: int,
         share_inputs: List[paddle.Tensor],
         accept_all_drafts: bool = False,
+        reject_all_drafts: bool = False,
     ) -> paddle.Tensor:
         """ """
 
@@ -622,7 +623,7 @@ class SpeculativeSampler(nn.Layer):
             max_model_len,
             self.speculative_verify_window,
             True,  # enable_topp
-            self.speculative_benchmark_mode,
+            (self.speculative_benchmark_mode or reject_all_drafts),
             accept_all_drafts,
         )
 
diff --git a/fastdeploy/worker/gpu_model_runner.py b/fastdeploy/worker/gpu_model_runner.py
index db56abf57..40a76e4c5 100644
--- a/fastdeploy/worker/gpu_model_runner.py
+++ b/fastdeploy/worker/gpu_model_runner.py
@@ -1334,6 +1334,8 @@ class GPUModelRunner(ModelRunnerBase):
         # TODO(wanglongzhi):Modifying the config at runtime is not appropriate; it needs to be moved to forward_meta. It will be used in MoEMethodBase.apply()
         if self.fd_config.parallel_config.use_ep and self.fd_config.scheduler_config.splitwise_role == "mixed":
             self.fd_config.model_config.moe_phase.phase = "decode" if if_only_decode else "prefill"
+            if self.speculative_decoding:
+                self.proposer.fd_config.parallel_config.moe_phase.phase = "decode" if if_only_decode else "prefill"
 
         # Update Batch type for cuda graph for only_prefill_batch
         only_prefill_use_cudagraph = self.use_cudagraph and self.cudagraph_only_prefill and self.only_prefill()
@@ -1572,6 +1574,8 @@ class GPUModelRunner(ModelRunnerBase):
         self,
         hidden_states: paddle.Tensor,
         model_output: paddle.Tensor,
+        accept_all_drafts=False,
+        reject_all_drafts=False,
     ) -> paddle.Tensor:
 
         logits = self.model.compute_logits(hidden_states)
@@ -1598,6 +1602,8 @@ class GPUModelRunner(ModelRunnerBase):
                 self.sampling_metadata,
                 self.model_config.max_model_len,
                 self.share_inputs,
+                accept_all_drafts,
+                reject_all_drafts,
             )
             sampler_output = None
             if self.parallel_config.tensor_parallel_size > 1:
@@ -1679,6 +1685,7 @@ class GPUModelRunner(ModelRunnerBase):
         in_capturing: bool = False,
         capture_prefill: bool = False,
         accept_all_drafts: bool = False,
+        reject_all_drafts: bool = False,
     ) -> paddle.Tensor:
         """
         Use dummy inputs to run before formal execution.
@@ -1688,6 +1695,7 @@ class GPUModelRunner(ModelRunnerBase):
             in_capturing: Is cuda graph in capturing state
             capture_prefill: Capture pure prefill for cuda graph
             accept_all_drafts: Target model will accept all draft tokens
+            reject_all_drafts: Target model will reject all draft tokens
         """
 
         input_length_list, max_dec_len_list, block_num = self.get_input_length_list(
@@ -1747,7 +1755,7 @@ class GPUModelRunner(ModelRunnerBase):
                     self._dummy_pooler_run(hidden_states)
                     break
                 else:
-                    self._dummy_sampler_run(hidden_states, model_output)
+                    self._dummy_sampler_run(hidden_states, model_output, accept_all_drafts, reject_all_drafts)
 
                # 7. Updata 'infer_seed' and step_cuda()
                self.share_inputs["infer_seed"].add_(self.infer_seed_increment)
@@ -1891,6 +1899,7 @@ class GPUModelRunner(ModelRunnerBase):
                     in_capturing=True,
                     expected_decode_len=3,
                     accept_all_drafts=False,
+                    reject_all_drafts=True,
                 )
                 logger.info(f"Warm up the Draft model with the num_tokens:{batch_size}, expected_decode_len:{3}")
 
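
Taken together, these hunks thread a new reject_all_drafts flag from the draft-model warmup through _dummy_run and _dummy_sampler_run into SpeculativeSampler.forward_cuda, where it is OR-ed into the existing speculative_benchmark_mode argument of the verify step so that every draft token is rejected. Below is a minimal, runnable sketch of that flag flow; the signatures are simplified, and speculate_verify here is a hypothetical stand-in for the real CUDA verify kernel, which takes many more arguments.

# Hypothetical stand-in for the verify kernel: benchmark mode rejects every
# draft token, accept_all_drafts accepts every one, otherwise accept a prefix.
def speculate_verify(draft_tokens, benchmark_mode, accept_all_drafts):
    if benchmark_mode:
        return []                  # all drafts rejected
    if accept_all_drafts:
        return list(draft_tokens)  # all drafts accepted
    return list(draft_tokens[:1])  # placeholder acceptance rule


class SpeculativeSampler:
    def __init__(self, speculative_benchmark_mode=False):
        self.speculative_benchmark_mode = speculative_benchmark_mode

    def forward_cuda(self, draft_tokens, accept_all_drafts=False, reject_all_drafts=False):
        # As in the diff, reject_all_drafts is OR-ed with benchmark mode, so
        # setting it forces the all-reject path without enabling benchmark
        # mode globally.
        return speculate_verify(
            draft_tokens,
            (self.speculative_benchmark_mode or reject_all_drafts),
            accept_all_drafts,
        )


sampler = SpeculativeSampler()
print(sampler.forward_cuda([7, 8, 9]))                          # [7]
print(sampler.forward_cuda([7, 8, 9], accept_all_drafts=True))  # [7, 8, 9]
print(sampler.forward_cuda([7, 8, 9], reject_all_drafts=True))  # []

The last hunk uses this during cuda-graph capture: the draft-model warmup now runs with reject_all_drafts=True, so the all-reject verify path is exercised alongside the existing accept_all_drafts warmup case.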