mirror of
https://github.com/PaddlePaddle/FastDeploy.git
synced 2025-10-06 00:57:33 +08:00
[Feature][MTP]Support new mtp (#3656)
* update multi-draft-token strategy
* fix format
* support hybrid mtp with ngram speculative decoding method
This commit is contained in:
@@ -23,14 +23,7 @@
|
||||
#define PD_BUILD_STATIC_OP(name) PD_BUILD_OP(static_op_##name)
|
||||
#endif
|
||||
|
||||
#define MAX_BSZ 256
|
||||
#define MAX_DRAFT_TOKENS 6
|
||||
|
||||
struct msgdata {
|
||||
int64_t mtype;
|
||||
int mtext[MAX_BSZ * MAX_DRAFT_TOKENS + MAX_BSZ +
|
||||
2]; // stop_flag, bsz, accept_num*bsz, tokens...
|
||||
};
|
||||
#include "speculate_msg.h"
|
||||
|
||||
void SpeculateGetOutput(const paddle::Tensor& x,
|
||||
int64_t rank_id,
|
||||
@@ -54,7 +47,7 @@ void SpeculateGetOutput(const paddle::Tensor& x,
|
||||
msg_queue_id = inference_msg_queue_id_from_env;
|
||||
}
|
||||
|
||||
static struct msgdata msg_rcv;
|
||||
static struct speculate_msgdata msg_rcv;
|
||||
|
||||
static key_t key = ftok("./", msg_queue_id);
|
||||
|
||||
|
Reference in New Issue
Block a user