polish code with new pre-commit rule (#2923)

Author: Zero Rains
Date: 2025-07-19 23:19:27 +08:00
Committed by: GitHub
Parent: b8676d71a8
Commit: 25698d56d1
424 changed files with 14307 additions and 13518 deletions
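
The hunks below are mechanical restyling rather than behavioural changes: backslash-continued imports become parenthesised multi-line imports with trailing commas, calls that fit the (apparently ~120-character) limit collapse onto one line, string and Literal[...] values use double quotes, docstrings lose their padding spaces, and argument-less raises drop the empty parentheses. The hook configuration itself is not part of this excerpt, so the exact tool and line length are assumptions; the following minimal sketch only mirrors the conventions visible in the diff.

# A minimal sketch of the restyled conventions, assuming a black/ruff-style
# formatter with a long line limit (the concrete hook and limit are assumptions).
from typing import Literal

# Multi-line imports: parentheses and trailing commas, no backslash continuations.
from concurrent.futures import (
    ThreadPoolExecutor,
)


def sample_mode(mode: Literal["truncated", "non-truncated"] = "truncated") -> str:
    """One-line docstrings keep no padding spaces inside the quotes."""
    if mode not in ("truncated", "non-truncated"):
        # Argument-less raises drop the empty call parentheses.
        raise NotImplementedError
    # Calls that fit within the limit stay on a single line.
    with ThreadPoolExecutor() as pool:
        return pool.submit(str.upper, mode).result()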

View File

@@ -11,6 +11,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""""
""" "
sample
"""

View File

@@ -15,7 +15,9 @@
"""
from .apply_penalty_multi_scores import (
apply_penalty_multi_scores, apply_speculative_penalty_multi_scores)
apply_penalty_multi_scores,
apply_speculative_penalty_multi_scores,
)
from .top_k_top_p_sampling import top_k_top_p_sampling
__all__ = [

View File

@@ -37,8 +37,8 @@ def apply_penalty_multi_scores(
apply_penalty_multi_scores
"""
if current_platform.is_cuda():
from fastdeploy.model_executor.ops.gpu import \
get_token_penalty_multi_scores
from fastdeploy.model_executor.ops.gpu import get_token_penalty_multi_scores
logits = get_token_penalty_multi_scores(
pre_token_ids,
prompt_ids,
@@ -54,8 +54,8 @@ def apply_penalty_multi_scores(
eos_token_ids,
)
elif current_platform.is_xpu():
from fastdeploy.model_executor.ops.xpu import \
get_token_penalty_multi_scores
from fastdeploy.model_executor.ops.xpu import get_token_penalty_multi_scores
logits = get_token_penalty_multi_scores(
pre_token_ids,
logits,
@@ -69,8 +69,10 @@ def apply_penalty_multi_scores(
eos_token_ids,
)
elif current_platform.is_iluvatar():
from fastdeploy.model_executor.ops.iluvatar import \
get_token_penalty_multi_scores
from fastdeploy.model_executor.ops.iluvatar import (
get_token_penalty_multi_scores,
)
logits = get_token_penalty_multi_scores(
pre_token_ids,
prompt_ids,
@@ -86,8 +88,8 @@ def apply_penalty_multi_scores(
eos_token_ids,
)
elif current_platform.is_gcu():
from fastdeploy.model_executor.ops.gcu import \
get_token_penalty_multi_scores
from fastdeploy.model_executor.ops.gcu import get_token_penalty_multi_scores
logits = get_token_penalty_multi_scores(
pre_token_ids,
logits,
@@ -101,7 +103,7 @@ def apply_penalty_multi_scores(
eos_token_ids,
)
else:
raise NotImplementedError()
raise NotImplementedError
return logits
@@ -126,8 +128,9 @@ def apply_speculative_penalty_multi_scores(
apply_speculative_penalty_multi_scores
"""
if current_platform.is_cuda():
from fastdeploy.model_executor.ops.gpu import \
speculate_get_token_penalty_multi_scores
from fastdeploy.model_executor.ops.gpu import (
speculate_get_token_penalty_multi_scores,
)
speculate_get_token_penalty_multi_scores(
pre_token_ids,
@@ -146,6 +149,6 @@ def apply_speculative_penalty_multi_scores(
max_len,
)
else:
raise NotImplementedError()
raise NotImplementedError
# inplace
return logits

View File

@@ -22,8 +22,8 @@ from fastdeploy import envs
from fastdeploy.platforms import current_platform
if current_platform.is_gcu():
from fastdeploy.model_executor.ops.gcu import \
top_p_sampling as gcu_top_p_sampling
from fastdeploy.model_executor.ops.gcu import top_p_sampling as gcu_top_p_sampling
def top_k_top_p_sampling(
x: paddle.Tensor,
@@ -33,8 +33,8 @@ def top_k_top_p_sampling(
topp_seed: Optional[paddle.Tensor] = None,
seed: int = -1,
k: int = 0,
mode: Literal['truncated', 'non-truncated'] = "truncated",
order: Literal['top_k_first', 'joint'] = "top_k_first",
mode: Literal["truncated", "non-truncated"] = "truncated",
order: Literal["top_k_first", "joint"] = "top_k_first",
) -> tuple[paddle.Tensor, paddle.Tensor]:
"""
x(Tensor): An input 2-D Tensor with type float32, float16 and bfloat16.
@@ -61,35 +61,33 @@ def top_k_top_p_sampling(
"""
top_p_class = envs.FD_SAMPLING_CLASS.lower()
if top_p_class == "air":
_, ids = air_top_p_sampling(x,
top_p,
threshold,
topp_seed,
seed=seed,
k=k,
mode=mode)
_, ids = air_top_p_sampling(x, top_p, threshold, topp_seed, seed=seed, k=k, mode=mode)
elif top_p_class == "rejection":
ids = rejection_top_p_sampling(x, top_p, top_k, seed, order)
_ = None
elif top_p_class == "base_non_truncated":
_, ids = paddle.tensor.top_p_sampling(x,
top_p,
threshold=threshold,
topp_seed=topp_seed,
seed=seed,
k=k,
mode="non-truncated")
_, ids = paddle.tensor.top_p_sampling(
x,
top_p,
threshold=threshold,
topp_seed=topp_seed,
seed=seed,
k=k,
mode="non-truncated",
)
else:
if current_platform.is_gcu():
_, ids = gcu_top_p_sampling(x, top_p)
else:
_, ids = paddle.tensor.top_p_sampling(x,
top_p,
threshold=threshold,
topp_seed=topp_seed,
seed=seed,
k=k,
mode="truncated")
_, ids = paddle.tensor.top_p_sampling(
x,
top_p,
threshold=threshold,
topp_seed=topp_seed,
seed=seed,
k=k,
mode="truncated",
)
return _, ids
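
For reference, a minimal usage sketch of the reformatted top_k_top_p_sampling dispatch above. The tensor shapes and dtypes are illustrative assumptions, as is leaving FD_SAMPLING_CLASS at its default so the call falls through to paddle.tensor.top_p_sampling in "truncated" mode; the three-argument call mirrors the one in the Sampler.forward_cuda hunk further down.

import paddle

from fastdeploy.model_executor.layers.sample.ops import top_k_top_p_sampling

# Hypothetical batch of 4 sequences over a 32000-token vocabulary (shapes assumed).
probs = paddle.nn.functional.softmax(paddle.randn([4, 32000]), axis=-1)
top_p = paddle.full([4, 1], 0.8, dtype="float32")  # per-sequence nucleus threshold
top_k = paddle.full([4, 1], 0, dtype="int64")  # 0 leaves the top-k stage inactive

# With the default FD_SAMPLING_CLASS this takes the final branch and returns
# (scores, sampled token ids).
_, ids = top_k_top_p_sampling(probs, top_p, top_k)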
@@ -100,15 +98,15 @@ def air_top_p_sampling(
topp_seed: Optional[paddle.Tensor] = None,
seed: int = -1,
k: int = 0,
mode: Literal['truncated', 'non-truncated'] = "truncated",
mode: Literal["truncated", "non-truncated"] = "truncated",
) -> tuple[paddle.Tensor, paddle.Tensor]:
"""
air_top_p_sampling
"""
try:
from fastdeploy.model_executor.ops.gpu import air_top_p_sampling
out, ids = air_top_p_sampling(x, top_p, threshold, topp_seed, seed, k,
mode)
out, ids = air_top_p_sampling(x, top_p, threshold, topp_seed, seed, k, mode)
except ImportError:
raise RuntimeError("Cannot import air_top_p_sampling op.")
return out, ids
@@ -119,14 +117,16 @@ def rejection_top_p_sampling(
top_p: paddle.Tensor,
top_k: paddle.Tensor,
seed: int = -1,
order: Literal['top_k_first', 'joint'] = "top_k_first",
order: Literal["top_k_first", "joint"] = "top_k_first",
) -> paddle.Tensor:
"""
rejection_top_p_sampling
"""
try:
from fastdeploy.model_executor.ops.gpu import (
rejection_top_p_sampling, top_k_renorm_probs)
rejection_top_p_sampling,
top_k_renorm_probs,
)
if paddle.count_nonzero(top_k) == 0:
ids = rejection_top_p_sampling(

View File

@@ -13,21 +13,25 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""
import threading
from concurrent.futures import ThreadPoolExecutor
from typing import Any, Dict, List, Optional
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from paddle import nn
from fastdeploy.config import FDConfig
from fastdeploy.model_executor.guided_decoding.base_guided_decoding import \
LogitsProcessorBase
from fastdeploy.model_executor.guided_decoding.base_guided_decoding import (
LogitsProcessorBase,
)
from fastdeploy.model_executor.layers.sample.meta_data import SamplingMetadata
from fastdeploy.model_executor.layers.sample.ops import (
apply_penalty_multi_scores, apply_speculative_penalty_multi_scores,
top_k_top_p_sampling)
apply_penalty_multi_scores,
apply_speculative_penalty_multi_scores,
top_k_top_p_sampling,
)
from fastdeploy.platforms import current_platform
from fastdeploy.worker.output import LogprobsTensors, SamplerOutput
@@ -44,11 +48,13 @@ class SamplerProcessor:
self.executor = ThreadPoolExecutor()
self.logits_lock = threading.Lock()
def add_logits_processor(self,
ids: int,
future: Optional[Any] = None,
prefill_tokens: List[int] = []):
""" add logits processor to SamplerProcessor """
def add_logits_processor(
self,
ids: int,
future: Optional[Any] = None,
prefill_tokens: List[int] = [],
):
"""add logits processor to SamplerProcessor"""
with self.logits_lock:
if future is None:
if ids in self.logits_processor:
@@ -67,7 +73,7 @@ class SamplerProcessor:
self.logits_processor[ids] = [future, prefill_tokens]
def update_vocab_mask(self, skip_idx_list: List[int] = []):
""" update vocab mask. (cpu-heavy operation) """
"""update vocab mask. (cpu-heavy operation)"""
if len(self.logits_processor) == 0:
return
@@ -102,10 +108,8 @@ class SamplerProcessor:
processor.fill_token_bitmask(self.token_bitmask, idx)
def apply_token_mask(self,
logits: paddle.Tensor,
skip_idx_list: List[int] = []):
""" apply token mask to logits """
def apply_token_mask(self, logits: paddle.Tensor, skip_idx_list: List[int] = []):
"""apply token mask to logits"""
if len(self.logits_processor) == 0 or self.token_bitmask is None:
return logits
@@ -121,26 +125,20 @@ class SamplerProcessor:
indices = list(self.logits_processor.keys())
mask_idx = [i for i in indices if i not in skip_idx_list]
return available_processors.apply_token_mask(logits,
self.token_bitmask,
indices=mask_idx)
return available_processors.apply_token_mask(logits, self.token_bitmask, indices=mask_idx)
def _accept_token(self, idx: int, token: int):
""" accept token """
"""accept token"""
if idx not in self.logits_processor:
raise ValueError(
f"Invalid index, idx: {idx}, logit_processors.keys: {self.logits_processor.keys()}"
)
raise ValueError(f"Invalid index, idx: {idx}, logit_processors.keys: {self.logits_processor.keys()}")
if self.logits_processor[idx].is_terminated():
return
self.logits_processor[idx].accept_token(token)
def update_output_tokens(self,
next_tokens: paddle.Tensor,
skip_idx_list: List[int] = []):
""" update output tokens """
def update_output_tokens(self, next_tokens: paddle.Tensor, skip_idx_list: List[int] = []):
"""update output tokens"""
if len(self.logits_processor) == 0:
return
@@ -148,14 +146,13 @@ class SamplerProcessor:
with self.logits_lock:
for idx in self.logits_processor.keys():
token = token_ids[idx][0]
if token < 0 or self.logits_processor[
idx] is None or idx in skip_idx_list:
if token < 0 or self.logits_processor[idx] is None or idx in skip_idx_list:
continue
self._accept_token(idx, token)
def pre_process(self, skip_idx_list: List[int] = []):
""" pre process before running """
"""pre process before running"""
# create async operation for guided decoding
# TODO: support async
self.update_vocab_mask(skip_idx_list)
@@ -168,31 +165,35 @@ class Sampler(nn.Layer):
"""
def __init__(self):
"""
"""
""" """
super().__init__()
if current_platform.is_cuda() or current_platform.is_xpu(
) or current_platform.is_iluvatar() or current_platform.is_gcu():
if (
current_platform.is_cuda()
or current_platform.is_xpu()
or current_platform.is_iluvatar()
or current_platform.is_gcu()
):
self.forward = self.forward_cuda
else:
raise NotImplementedError()
raise NotImplementedError
self.processor = SamplerProcessor()
def apply_logits_processor(self,
ids: int,
future: Optional[Any] = None,
prefill_tokens: List[int] = []):
""" apply logits processor to sampler """
def apply_logits_processor(
self,
ids: int,
future: Optional[Any] = None,
prefill_tokens: List[int] = [],
):
"""apply logits processor to sampler"""
self.processor.add_logits_processor(ids, future, prefill_tokens)
def pre_process(self, skip_idx_list: List[int] = []):
""" pre process before running """
"""pre process before running"""
self.processor.pre_process(skip_idx_list)
def compute_logprobs(self, logits: paddle.Tensor) -> paddle.Tensor:
"""
"""
""" """
return F.log_softmax(logits, axis=-1)
def gather_logprobs(
@@ -226,9 +227,7 @@ class Sampler(nn.Layer):
if num_logprobs >= 1:
# Find the topK values.
topk_logprobs, topk_indices = paddle.topk(logprobs,
num_logprobs,
axis=-1)
topk_logprobs, topk_indices = paddle.topk(logprobs, num_logprobs, axis=-1)
indices = paddle.concat([token_ids, topk_indices], axis=1)
top_logprobs = paddle.concat([token_logprobs, topk_logprobs], axis=1)
else:
@@ -243,8 +242,7 @@ class Sampler(nn.Layer):
sampling_metadata: SamplingMetadata,
skip_idx_list: List[int] = [],
) -> SamplerOutput:
"""
"""
""" """
num_logprobs = sampling_metadata.max_num_logprobs
if num_logprobs is not None:
raw_logprobs = self.compute_logprobs(logits)
@@ -270,8 +268,9 @@ class Sampler(nn.Layer):
_, next_tokens = top_k_top_p_sampling(probs, sampling_metadata.top_p, sampling_metadata.top_k)
logprobs_tensors = None if num_logprobs is None else \
self.gather_logprobs(raw_logprobs, num_logprobs, token_ids=next_tokens)
logprobs_tensors = (
None if num_logprobs is None else self.gather_logprobs(raw_logprobs, num_logprobs, token_ids=next_tokens)
)
self.processor.update_output_tokens(next_tokens, skip_idx_list)
@@ -291,26 +290,27 @@ class SpeculativeSampler(nn.Layer):
"""
def __init__(self, fd_config: FDConfig):
"""
"""
""" """
super().__init__()
if current_platform.is_cuda():
self.forward = self.forward_cuda
else:
raise NotImplementedError()
raise NotImplementedError
self.speculative_verify_window = fd_config.speculative_config.verify_window
self.speculative_max_candidate_len = fd_config.speculative_config.max_candidate_len
self.speculative_benchmark_mode = fd_config.speculative_config.benchmark_mode
def pre_process(self, skip_idx_list: List[int] = []):
""" pre process before running """
"""pre process before running"""
pass
def apply_logits_processor(self,
ids: int,
future: Optional[Any] = None,
prefill_tokens: List[int] = []):
""" apply logits processor to sampler """
def apply_logits_processor(
self,
ids: int,
future: Optional[Any] = None,
prefill_tokens: List[int] = [],
):
"""apply logits processor to sampler"""
pass
def forward_cuda(
@@ -320,11 +320,9 @@ class SpeculativeSampler(nn.Layer):
max_model_len: int,
share_inputs: List[paddle.Tensor],
) -> paddle.Tensor:
"""
"""
""" """
from fastdeploy.model_executor.ops.gpu import (speculate_verify,
top_p_candidates)
from fastdeploy.model_executor.ops.gpu import speculate_verify, top_p_candidates
logits = apply_speculative_penalty_multi_scores(
sampling_metadata.pre_token_ids,
@@ -361,7 +359,8 @@ class SpeculativeSampler(nn.Layer):
share_inputs["seq_lens_encoder"],
share_inputs["seq_lens_decoder"],
share_inputs[
"draft_tokens"], # Both input and output, need to write the last 1 token accepted to position 0.
"draft_tokens"
], # Both input and output, need to write the last 1 token accepted to position 0.
share_inputs["seq_lens_this_time"],
verify_tokens,
verify_scores,
@@ -382,27 +381,27 @@ class SpeculativeSampler(nn.Layer):
class MTPSampler(nn.Layer):
"""
"""
""" """
def __init__(self, fd_config: FDConfig):
"""
"""
""" """
super().__init__()
if current_platform.is_cuda():
self.forward = self.forward_cuda
else:
raise NotImplementedError()
raise NotImplementedError
def pre_process(self, skip_idx_list: List[int] = []):
""" pre process before running """
"""pre process before running"""
pass
def apply_logits_processor(self,
ids: int,
future: Optional[Any] = None,
prefill_tokens: List[int] = []):
""" apply logits processor to sampler """
def apply_logits_processor(
self,
ids: int,
future: Optional[Any] = None,
prefill_tokens: List[int] = [],
):
"""apply logits processor to sampler"""
pass
def forward_cuda(
@@ -412,8 +411,7 @@ class MTPSampler(nn.Layer):
max_model_len: int,
share_inputs: List[paddle.Tensor],
) -> paddle.Tensor:
"""
"""
""" """
logits = apply_speculative_penalty_multi_scores(
sampling_metadata.pre_token_ids,
logits,