mirror of
				https://github.com/PaddlePaddle/FastDeploy.git
				synced 2025-11-01 04:12:58 +08:00 
			
		
		
		
	[Graph Optimization][Speculative Decoding] Fix the bug of CUDAGraph + MTP + EP  (#4430)
	
		
			
	
		
	
	
		
	
		
			Some checks failed
		
		
	
	
		
			
				
	
				CE Compile Job / ce_job_pre_check (push) Has been cancelled
				
			
		
			
				
	
				CE Compile Job / print_ce_job_pre_check_outputs (push) Has been cancelled
				
			
		
			
				
	
				CE Compile Job / FD-Clone-Linux (push) Has been cancelled
				
			
		
			
				
	
				CE Compile Job / Show Code Archive Output (push) Has been cancelled
				
			
		
			
				
	
				CE Compile Job / BUILD_SM8090 (push) Has been cancelled
				
			
		
			
				
	
				CE Compile Job / BUILD_SM8689 (push) Has been cancelled
				
			
		
			
				
	
				CE Compile Job / CE_UPLOAD (push) Has been cancelled
				
			
		
		
	
	
				
					
				
			
		
			Some checks failed
		
		
	
	CE Compile Job / ce_job_pre_check (push) Has been cancelled
				
			CE Compile Job / print_ce_job_pre_check_outputs (push) Has been cancelled
				
			CE Compile Job / FD-Clone-Linux (push) Has been cancelled
				
			CE Compile Job / Show Code Archive Output (push) Has been cancelled
				
			CE Compile Job / BUILD_SM8090 (push) Has been cancelled
				
			CE Compile Job / BUILD_SM8689 (push) Has been cancelled
				
			CE Compile Job / CE_UPLOAD (push) Has been cancelled
				
			* Fix MTP dummy run bug * Target Model and Draft Model using the same flag * Avoid MoE bug in cudagraph padding * In MTP, replace use_cudagraph with step_use_cudagraph
This commit is contained in:
		| @@ -1253,7 +1253,9 @@ class GPUModelRunner(ModelRunnerBase): | ||||
|  | ||||
|             if self.speculative_decoding: | ||||
|                 if self.speculative_method == "mtp": | ||||
|                     self.proposer.run(full_hidden_states=model_output) | ||||
|                     self.proposer.run( | ||||
|                         full_hidden_states=model_output, step_use_cudagraph=self.forward_meta.step_use_cudagraph | ||||
|                     ) | ||||
|                 else: | ||||
|                     self.proposer.run(share_inputs=self.share_inputs) | ||||
|  | ||||
| @@ -1600,7 +1602,9 @@ class GPUModelRunner(ModelRunnerBase): | ||||
|         # 6. Speculative decode | ||||
|         if self.speculative_decoding: | ||||
|             if self.speculative_method == "mtp": | ||||
|                 self.proposer.run(full_hidden_states=model_output) | ||||
|                 self.proposer.run( | ||||
|                     full_hidden_states=model_output, step_use_cudagraph=self.forward_meta.step_use_cudagraph | ||||
|                 ) | ||||
|             else: | ||||
|                 self.proposer.run(share_inputs=self.share_inputs) | ||||
|  | ||||
|   | ||||
		Reference in New Issue
	
	Block a user
	 RAM
					RAM