[Feature][MTP] Support new MTP (#3656)

* Update multi-draft-token strategy
* Fix format
* Support hybrid MTP with n-gram speculative decoding method
2025-10-05 08:37:06 +08:00 · 2025-08-27 19:38:26 +08:00
parent 62659a7a73
commit c753f1fc9e
20 changed files with 501 additions and 579 deletions
--- a/custom_ops/gpu_ops/speculate_decoding/speculate_save_output.cc
+++ b/custom_ops/gpu_ops/speculate_decoding/speculate_save_output.cc
@@ -23,14 +23,7 @@
 #define PD_BUILD_STATIC_OP(name) PD_BUILD_OP(static_op_##name)
 #endif

-#define MAX_BSZ 256
-#define MAX_DRAFT_TOKENS 6
-
-struct msgdata {
-    long mtype;
-    int mtext[MAX_BSZ * MAX_DRAFT_TOKENS + MAX_BSZ +
-              2];  // stop_flag, bsz, tokens
-};
+#include "speculate_msg.h"

 void SpeculateSaveWithOutputMsg(const paddle::Tensor& accept_tokens,
                                const paddle::Tensor& accept_num,
@@ -62,7 +55,7 @@ void SpeculateSaveWithOutputMsg(const paddle::Tensor& accept_tokens,
 #endif
        msg_queue_id = inference_msg_queue_id_from_env;
    }
-    static struct msgdata msg_sed;
+    static struct speculate_msgdata msg_sed;
    static key_t key = ftok("./", msg_queue_id);
    static int msgid = msgget(key, IPC_CREAT | 0666);