mirror of
https://github.com/PaddlePaddle/FastDeploy.git
synced 2025-10-04 08:16:42 +08:00
[MTP] optimize mtp infer speed (#2840)
Some checks failed
Deploy GitHub Pages / deploy (push) Has been cancelled
Some checks failed
Deploy GitHub Pages / deploy (push) Has been cancelled
This commit is contained in:
@@ -246,7 +246,7 @@ void token_penalty_multi_scores_kernel(
|
||||
max_seq_len);
|
||||
}
|
||||
|
||||
void TokenPenaltyMultiScores(const paddle::Tensor &pre_ids,
|
||||
void SpecTokenPenaltyMultiScores(const paddle::Tensor &pre_ids,
|
||||
const paddle::Tensor &logits,
|
||||
const paddle::Tensor &penalty_scores,
|
||||
const paddle::Tensor &frequency_scores,
|
||||
@@ -338,4 +338,4 @@ PD_BUILD_STATIC_OP(speculate_get_token_penalty_multi_scores)
|
||||
.Outputs({"logits_out"})
|
||||
.Attrs({"max_seq_len: int"})
|
||||
.SetInplaceMap({{"logits", "logits_out"}})
|
||||
.SetKernelFn(PD_KERNEL(TokenPenaltyMultiScores));
|
||||
.SetKernelFn(PD_KERNEL(SpecTokenPenaltyMultiScores));
|
||||
|
@@ -266,18 +266,6 @@ void SpeculateVerify(
|
||||
seed++;
|
||||
offset++;
|
||||
|
||||
auto err = cudaDeviceSynchronize();
|
||||
if (err != 0) {
|
||||
printf("err %d\n", err);
|
||||
}
|
||||
|
||||
err = cudaGetLastError();
|
||||
|
||||
if (err != 0) {
|
||||
printf("err %d\n", err);
|
||||
}
|
||||
|
||||
// printf("inited curand\n");
|
||||
bool use_topk = false;
|
||||
char *env_var = getenv("SPECULATE_VERIFY_USE_TOPK");
|
||||
if (env_var) {
|
||||
|
Reference in New Issue
Block a user