[Feature][MTP]Support new mtp (#3656)

* update multi-draft-token strategy
* fix format
* support hybrid MTP with ngram speculative decoding method
2025-10-06 00:57:33 +08:00 · 2025-08-27 19:38:26 +08:00
parent 62659a7a73
commit c753f1fc9e
20 changed files with 501 additions and 579 deletions
--- a/custom_ops/gpu_ops/speculate_decoding/speculate_get_output.cc
+++ b/custom_ops/gpu_ops/speculate_decoding/speculate_get_output.cc
@@ -23,14 +23,7 @@
 #define PD_BUILD_STATIC_OP(name) PD_BUILD_OP(static_op_##name)
 #endif

-#define MAX_BSZ 256
-#define MAX_DRAFT_TOKENS 6
-
-struct msgdata {
-    int64_t mtype;
-    int mtext[MAX_BSZ * MAX_DRAFT_TOKENS + MAX_BSZ +
-              2];  // stop_flag, bsz, accept_num*bsz, tokens...
-};
+#include "speculate_msg.h"

 void SpeculateGetOutput(const paddle::Tensor& x,
                        int64_t rank_id,
@@ -54,7 +47,7 @@ void SpeculateGetOutput(const paddle::Tensor& x,
        msg_queue_id = inference_msg_queue_id_from_env;
    }

-    static struct msgdata msg_rcv;
+    static struct speculate_msgdata msg_rcv;

    static key_t key = ftok("./", msg_queue_id);