mirror of
https://github.com/PaddlePaddle/FastDeploy.git
synced 2025-10-07 09:31:35 +08:00
Sync v2.0 version of code to github repo
This commit is contained in:
@@ -14,8 +14,12 @@
|
||||
# limitations under the License.
|
||||
"""
|
||||
|
||||
from .apply_penalty_multi_scores import apply_penalty_multi_scores
|
||||
from .apply_penalty_multi_scores import (
|
||||
apply_penalty_multi_scores, apply_speculative_penalty_multi_scores)
|
||||
from .top_p_sampling import top_p_sampling
|
||||
|
||||
__all__ = [
|
||||
"apply_penalty_multi_scores",
|
||||
"apply_speculative_penalty_multi_scores",
|
||||
"top_p_sampling",
|
||||
]
|
||||
|
@@ -20,7 +20,7 @@ from fastdeploy.platforms import current_platform
|
||||
|
||||
|
||||
def apply_penalty_multi_scores(
|
||||
prompt_token_ids: paddle.Tensor,
|
||||
pre_token_ids: paddle.Tensor,
|
||||
logits: paddle.Tensor,
|
||||
repetition_penalties: paddle.Tensor,
|
||||
frequency_penalties: paddle.Tensor,
|
||||
@@ -30,16 +30,30 @@ def apply_penalty_multi_scores(
|
||||
step_idx: paddle.Tensor,
|
||||
min_dec_lens: paddle.Tensor,
|
||||
eos_token_ids: paddle.Tensor,
|
||||
):
|
||||
) -> paddle.Tensor:
|
||||
"""
|
||||
Args:
|
||||
Returns:
|
||||
apply_penalty_multi_scores
|
||||
"""
|
||||
if current_platform.is_cuda():
|
||||
from fastdeploy.model_executor.ops.gpu import \
|
||||
get_token_penalty_multi_scores
|
||||
logits = get_token_penalty_multi_scores(
|
||||
prompt_token_ids,
|
||||
pre_token_ids,
|
||||
logits,
|
||||
repetition_penalties,
|
||||
frequency_penalties,
|
||||
presence_penalties,
|
||||
temperature,
|
||||
bad_words_token_ids,
|
||||
step_idx,
|
||||
min_dec_lens,
|
||||
eos_token_ids,
|
||||
)
|
||||
elif current_platform.is_xpu():
|
||||
from fastdeploy.model_executor.ops.xpu import \
|
||||
get_token_penalty_multi_scores
|
||||
logits = get_token_penalty_multi_scores(
|
||||
pre_token_ids,
|
||||
logits,
|
||||
repetition_penalties,
|
||||
frequency_penalties,
|
||||
@@ -54,3 +68,48 @@ def apply_penalty_multi_scores(
|
||||
raise NotImplementedError()
|
||||
|
||||
return logits
|
||||
|
||||
|
||||
def apply_speculative_penalty_multi_scores(
    pre_token_ids: paddle.Tensor,
    logits: paddle.Tensor,
    repetition_penalties: paddle.Tensor,
    frequency_penalties: paddle.Tensor,
    presence_penalties: paddle.Tensor,
    temperature: paddle.Tensor,
    bad_words_token_ids: paddle.Tensor,
    step_idx: paddle.Tensor,
    min_dec_lens: paddle.Tensor,
    eos_token_ids: paddle.Tensor,
    seq_lens_this_time: paddle.Tensor,
    output_padding_offset: paddle.Tensor,
    output_cum_offsets: paddle.Tensor,
    max_len: int,
):
    """
    Forward all arguments, in declaration order, to the GPU custom op
    ``speculate_get_token_penalty_multi_scores`` and return its result.

    Args:
        All parameters are passed through to the custom op unchanged.
        NOTE(review): their shapes, dtypes and exact meaning are defined by
        the op, which is not visible here -- confirm against the kernel.

    Returns:
        Whatever the custom op returns (presumably the adjusted logits --
        TODO confirm).

    Raises:
        NotImplementedError: if the current platform is not CUDA.
    """
    # Only a CUDA implementation is wired up; bail out early otherwise.
    if not current_platform.is_cuda():
        raise NotImplementedError()

    # Imported lazily so the GPU ops module is only touched on CUDA.
    from fastdeploy.model_executor.ops.gpu import \
        speculate_get_token_penalty_multi_scores as penalty_op

    return penalty_op(
        pre_token_ids,
        logits,
        repetition_penalties,
        frequency_penalties,
        presence_penalties,
        temperature,
        bad_words_token_ids,
        step_idx,
        min_dec_lens,
        eos_token_ids,
        seq_lens_this_time,
        output_padding_offset,
        output_cum_offsets,
        max_len,
    )
|
||||
|
@@ -0,0 +1,97 @@
|
||||
"""
|
||||
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""
|
||||
|
||||
from typing import Literal, Optional
|
||||
|
||||
import paddle
|
||||
|
||||
from fastdeploy import envs
|
||||
|
||||
|
||||
def top_p_sampling(
    x: paddle.Tensor,
    ps: paddle.Tensor,
    threshold: Optional[paddle.Tensor] = None,
    topp_seed: Optional[paddle.Tensor] = None,
    seed: int = -1,
    k: int = 0,
    mode: Literal['truncated', 'non-truncated'] = "truncated",
) -> tuple[paddle.Tensor, paddle.Tensor]:
    """
    Run top-p sampling with the backend selected by the lower-cased value
    of ``envs.FD_SAMPLING_CLASS``.

    Backends:
        "air":       ``air_top_p_sampling`` (all arguments forwarded).
        "rejection": ``rejection_top_p_sampling`` (only ``x``, ``ps`` and
                     ``seed`` are forwarded).
        otherwise:   ``paddle.tensor.top_p_sampling``.

    Returns:
        A 2-tuple whose second element is the sampled ids. The first
        element is the backend's first output, or ``None`` for the
        "rejection" backend, which produces ids only.
    """
    backend = envs.FD_SAMPLING_CLASS.lower()

    if backend == "rejection":
        # The rejection op yields ids only; None fills the first slot.
        return None, rejection_top_p_sampling(x, ps, seed)

    if backend == "air":
        first_out, sampled_ids = air_top_p_sampling(
            x,
            ps,
            threshold,
            topp_seed,
            seed=seed,
            k=k,
            mode=mode,
        )
        return first_out, sampled_ids

    # Any other setting falls back to Paddle's built-in implementation.
    first_out, sampled_ids = paddle.tensor.top_p_sampling(
        x,
        ps,
        threshold=threshold,
        topp_seed=topp_seed,
        seed=seed,
        k=k,
        mode=mode,
    )
    return first_out, sampled_ids
|
||||
|
||||
|
||||
def air_top_p_sampling(
    x: paddle.Tensor,
    ps: paddle.Tensor,
    threshold: Optional[paddle.Tensor] = None,
    topp_seed: Optional[paddle.Tensor] = None,
    seed: int = -1,
    k: int = 0,
    mode: Literal['truncated', 'non-truncated'] = "truncated",
) -> tuple[paddle.Tensor, paddle.Tensor]:
    """
    Call the GPU custom op ``air_top_p_sampling`` with the given arguments.

    Args:
        All parameters are forwarded positionally to the custom op.
        NOTE(review): their shapes and semantics are defined by the op,
        which is not visible here -- confirm against the kernel.

    Returns:
        The ``(out, ids)`` pair produced by the custom op.

    Raises:
        RuntimeError: if the custom op cannot be imported. The original
            ImportError is attached as ``__cause__``.
    """
    try:
        # Aliased so the op does not shadow this wrapper's own name.
        from fastdeploy.model_executor.ops.gpu import \
            air_top_p_sampling as _air_top_p_op
    except ImportError as err:
        raise RuntimeError("Cannot import air_top_p_sampling op.") from err

    # The call sits outside the try so that only a failed import is
    # reported as "Cannot import"; errors from the op itself propagate.
    out, ids = _air_top_p_op(x, ps, threshold, topp_seed, seed, k, mode)
    return out, ids
|
||||
|
||||
|
||||
def rejection_top_p_sampling(
    x: paddle.Tensor,
    ps: paddle.Tensor,
    seed: int = -1,
) -> paddle.Tensor:
    """
    Call the GPU custom op ``rejection_top_p_sampling`` with ``x``, ``ps``
    and ``seed``.

    Args:
        All parameters are forwarded positionally to the custom op.
        NOTE(review): their shapes and semantics are defined by the op,
        which is not visible here -- confirm against the kernel.

    Returns:
        The value returned by the custom op (the sampled ids).

    Raises:
        RuntimeError: if the custom op cannot be imported. The original
            ImportError is attached as ``__cause__``.
    """
    try:
        # Aliased so the op does not shadow this wrapper's own name.
        from fastdeploy.model_executor.ops.gpu import \
            rejection_top_p_sampling as _rejection_top_p_op
    except ImportError as err:
        raise RuntimeError(
            "Cannot import rejection_top_p_sampling op.") from err

    # The call sits outside the try so that only a failed import is
    # reported as "Cannot import"; errors from the op itself propagate.
    return _rejection_top_p_op(x, ps, seed)
|
Reference in New Issue
Block a user