Sync the v2.0 version of the code to the GitHub repo

This commit is contained in:
Jiang-Jia-Jun
2025-06-29 23:29:37 +00:00
parent d151496038
commit 92c2cfa2e7
597 changed files with 78776 additions and 22905 deletions

View File

@@ -14,8 +14,12 @@
# limitations under the License.
"""
from .apply_penalty_multi_scores import apply_penalty_multi_scores
from .apply_penalty_multi_scores import (
apply_penalty_multi_scores, apply_speculative_penalty_multi_scores)
from .top_p_sampling import top_p_sampling
# Public names of this package: these are what `from <package> import *`
# exposes. Each entry is imported from a submodule above.
__all__ = [
"apply_penalty_multi_scores",
"apply_speculative_penalty_multi_scores",
"top_p_sampling",
]

View File

@@ -20,7 +20,7 @@ from fastdeploy.platforms import current_platform
def apply_penalty_multi_scores(
prompt_token_ids: paddle.Tensor,
pre_token_ids: paddle.Tensor,
logits: paddle.Tensor,
repetition_penalties: paddle.Tensor,
frequency_penalties: paddle.Tensor,
@@ -30,16 +30,30 @@ def apply_penalty_multi_scores(
step_idx: paddle.Tensor,
min_dec_lens: paddle.Tensor,
eos_token_ids: paddle.Tensor,
):
) -> paddle.Tensor:
"""
Args:
Returns:
apply_penalty_multi_scores
"""
if current_platform.is_cuda():
from fastdeploy.model_executor.ops.gpu import \
get_token_penalty_multi_scores
logits = get_token_penalty_multi_scores(
prompt_token_ids,
pre_token_ids,
logits,
repetition_penalties,
frequency_penalties,
presence_penalties,
temperature,
bad_words_token_ids,
step_idx,
min_dec_lens,
eos_token_ids,
)
elif current_platform.is_xpu():
from fastdeploy.model_executor.ops.xpu import \
get_token_penalty_multi_scores
logits = get_token_penalty_multi_scores(
pre_token_ids,
logits,
repetition_penalties,
frequency_penalties,
@@ -54,3 +68,48 @@ def apply_penalty_multi_scores(
raise NotImplementedError()
return logits
def apply_speculative_penalty_multi_scores(
    pre_token_ids: paddle.Tensor,
    logits: paddle.Tensor,
    repetition_penalties: paddle.Tensor,
    frequency_penalties: paddle.Tensor,
    presence_penalties: paddle.Tensor,
    temperature: paddle.Tensor,
    bad_words_token_ids: paddle.Tensor,
    step_idx: paddle.Tensor,
    min_dec_lens: paddle.Tensor,
    eos_token_ids: paddle.Tensor,
    seq_lens_this_time: paddle.Tensor,
    output_padding_offset: paddle.Tensor,
    output_cum_offsets: paddle.Tensor,
    max_len: int,
):
    """
    Run the speculative token-penalty custom op on ``logits``.

    This is a thin platform-dispatch wrapper. On CUDA, every argument is
    forwarded positionally, in declaration order, to the
    ``speculate_get_token_penalty_multi_scores`` op from
    ``fastdeploy.model_executor.ops.gpu`` and the op's result is returned.
    No other platform is supported.

    NOTE(review): the exact shapes/dtypes expected for each tensor are
    defined by the custom op and are not visible here -- confirm against
    the op's implementation before relying on them.

    Returns:
        Whatever the custom op returns for the penalized logits.

    Raises:
        NotImplementedError: If the current platform is not CUDA.
    """
    # Guard first: only the CUDA build ships this op.
    if not current_platform.is_cuda():
        raise NotImplementedError()

    # Imported lazily so that non-CUDA installs never touch the GPU ops module.
    from fastdeploy.model_executor.ops.gpu import \
        speculate_get_token_penalty_multi_scores as penalty_op

    return penalty_op(
        pre_token_ids,
        logits,
        repetition_penalties,
        frequency_penalties,
        presence_penalties,
        temperature,
        bad_words_token_ids,
        step_idx,
        min_dec_lens,
        eos_token_ids,
        seq_lens_this_time,
        output_padding_offset,
        output_cum_offsets,
        max_len,
    )

View File

@@ -0,0 +1,97 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
from typing import Literal, Optional
import paddle
from fastdeploy import envs
def top_p_sampling(
    x: paddle.Tensor,
    ps: paddle.Tensor,
    threshold: Optional[paddle.Tensor] = None,
    topp_seed: Optional[paddle.Tensor] = None,
    seed: int = -1,
    k: int = 0,
    mode: Literal['truncated', 'non-truncated'] = "truncated",
) -> tuple[Optional[paddle.Tensor], paddle.Tensor]:
    """
    Dispatch top-p sampling to the backend chosen by ``FD_SAMPLING_CLASS``.

    The backend is selected by ``envs.FD_SAMPLING_CLASS`` (compared
    case-insensitively):

    * ``"air"``       -> ``air_top_p_sampling`` with all arguments.
    * ``"rejection"`` -> ``rejection_top_p_sampling``; only ``x``, ``ps`` and
      ``seed`` are forwarded, so ``threshold``, ``topp_seed``, ``k`` and
      ``mode`` are ignored on this path.
    * anything else   -> ``paddle.tensor.top_p_sampling`` with all arguments.

    Args:
        x: Input tensor to sample from (presumably per-token probabilities
            -- confirm against the backend op).
        ps: Top-p values passed through to the backend.
        threshold: Optional threshold tensor passed to the "air" and Paddle
            backends.
        topp_seed: Optional per-call seed tensor passed to the "air" and
            Paddle backends.
        seed: Integer seed forwarded to the selected backend.
        k: Forwarded as ``k`` to the "air" and Paddle backends.
        mode: Forwarded as ``mode`` to the "air" and Paddle backends.

    Returns:
        A ``(out, ids)`` pair. ``ids`` is the backend's sampled-id output.
        ``out`` is the backend's first output, or ``None`` on the
        "rejection" path, which produces ids only.
    """
    top_p_class = envs.FD_SAMPLING_CLASS.lower()
    if top_p_class == "air":
        out, ids = air_top_p_sampling(x,
                                      ps,
                                      threshold,
                                      topp_seed,
                                      seed=seed,
                                      k=k,
                                      mode=mode)
    elif top_p_class == "rejection":
        # The rejection backend returns ids only; there is no first output.
        out = None
        ids = rejection_top_p_sampling(x, ps, seed)
    else:
        # Any other value falls back to Paddle's built-in implementation.
        out, ids = paddle.tensor.top_p_sampling(x,
                                                ps,
                                                threshold=threshold,
                                                topp_seed=topp_seed,
                                                seed=seed,
                                                k=k,
                                                mode=mode)
    return out, ids
def air_top_p_sampling(
    x: paddle.Tensor,
    ps: paddle.Tensor,
    threshold: Optional[paddle.Tensor] = None,
    topp_seed: Optional[paddle.Tensor] = None,
    seed: int = -1,
    k: int = 0,
    mode: Literal['truncated', 'non-truncated'] = "truncated",
) -> tuple[paddle.Tensor, paddle.Tensor]:
    """
    Call the ``air_top_p_sampling`` GPU custom op.

    The op is imported lazily from ``fastdeploy.model_executor.ops.gpu`` and
    invoked with all arguments positionally, in declaration order.

    Args:
        x: Input tensor handed to the op.
        ps: Top-p values handed to the op.
        threshold: Optional threshold tensor handed to the op.
        topp_seed: Optional seed tensor handed to the op.
        seed: Integer seed handed to the op.
        k: Integer ``k`` handed to the op.
        mode: Sampling mode string handed to the op.

    Returns:
        The ``(out, ids)`` pair produced by the op.

    Raises:
        RuntimeError: If the custom op cannot be imported. The original
            ``ImportError`` is attached as ``__cause__``.
    """
    # Guard only the import: an ImportError raised while the op itself runs
    # is a different failure and must not be reported as a missing op.
    try:
        from fastdeploy.model_executor.ops.gpu import \
            air_top_p_sampling as air_top_p_sampling_op
    except ImportError as err:
        raise RuntimeError("Cannot import air_top_p_sampling op.") from err

    out, ids = air_top_p_sampling_op(x, ps, threshold, topp_seed, seed, k,
                                     mode)
    return out, ids
def rejection_top_p_sampling(
    x: paddle.Tensor,
    ps: paddle.Tensor,
    seed: int = -1,
) -> paddle.Tensor:
    """
    Call the ``rejection_top_p_sampling`` GPU custom op.

    The op is imported lazily from ``fastdeploy.model_executor.ops.gpu`` and
    invoked positionally with ``x``, ``ps`` and ``seed``.

    Args:
        x: Input tensor handed to the op.
        ps: Top-p values handed to the op.
        seed: Integer seed handed to the op.

    Returns:
        The ids produced by the op.

    Raises:
        RuntimeError: If the custom op cannot be imported. The original
            ``ImportError`` is attached as ``__cause__``.
    """
    # Guard only the import: an ImportError raised while the op itself runs
    # is a different failure and must not be reported as a missing op.
    try:
        from fastdeploy.model_executor.ops.gpu import \
            rejection_top_p_sampling as rejection_top_p_sampling_op
    except ImportError as err:
        raise RuntimeError(
            "Cannot import rejection_top_p_sampling op.") from err

    ids = rejection_top_p_sampling_op(
        x,
        ps,
        seed,
    )
    return ids