[MTP] optimize mtp infer speed (#2840)
Some checks failed
Deploy GitHub Pages / deploy (push) Has been cancelled

This commit is contained in:
freeliuzc
2025-07-14 19:50:22 +08:00
committed by GitHub
parent 4c7b8bc458
commit 7cdd8d290d
6 changed files with 253 additions and 24 deletions

View File

@@ -246,7 +246,7 @@ void token_penalty_multi_scores_kernel(
max_seq_len);
}
void TokenPenaltyMultiScores(const paddle::Tensor &pre_ids,
void SpecTokenPenaltyMultiScores(const paddle::Tensor &pre_ids,
const paddle::Tensor &logits,
const paddle::Tensor &penalty_scores,
const paddle::Tensor &frequency_scores,
@@ -338,4 +338,4 @@ PD_BUILD_STATIC_OP(speculate_get_token_penalty_multi_scores)
.Outputs({"logits_out"})
.Attrs({"max_seq_len: int"})
.SetInplaceMap({{"logits", "logits_out"}})
.SetKernelFn(PD_KERNEL(TokenPenaltyMultiScores));
.SetKernelFn(PD_KERNEL(SpecTokenPenaltyMultiScores));

View File

@@ -266,18 +266,6 @@ void SpeculateVerify(
seed++;
offset++;
auto err = cudaDeviceSynchronize();
if (err != 0) {
printf("err %d\n", err);
}
err = cudaGetLastError();
if (err != 0) {
printf("err %d\n", err);
}
// printf("inited curand\n");
bool use_topk = false;
char *env_var = getenv("SPECULATE_VERIFY_USE_TOPK");
if (env_var) {