[Feature] support min_p_sampling (#2872)
Some checks failed
Deploy GitHub Pages / deploy (push) Has been cancelled

* Fastdeploy support min_p

* add test_min_p

* fix

* min_p_sampling

* update

* delete vl_gpu_model_runner.py

* fix

* Align usage of min_p with vLLM

* fix

* modified unit test

* fix test_min_sampling

* pre-commit all files

* fix

* fix

* fix

* fix xpu_model_runner.py
This commit is contained in:
lizexu123
2025-07-21 14:17:59 +08:00
committed by GitHub
parent 95a214ae43
commit 67990e0572
15 changed files with 302 additions and 1 deletions

View File

@@ -42,6 +42,7 @@ class SamplingMetadata:
top_p: paddle.Tensor
top_k: Optional[paddle.Tensor] = None
min_p: Optional[paddle.Tensor] = None
max_num_logprobs: Optional[int] = None
prompt_ids: Optional[paddle.Tensor] = None
prompt_lens: Optional[paddle.Tensor] = None

View File

@@ -18,10 +18,11 @@ from .apply_penalty_multi_scores import (
apply_penalty_multi_scores,
apply_speculative_penalty_multi_scores,
)
from .top_k_top_p_sampling import top_k_top_p_sampling
from .top_k_top_p_sampling import min_p_sampling, top_k_top_p_sampling
# Names this package exposes via `from ... import *`; each entry is
# imported from a submodule by the import statements directly above.
__all__ = [
"apply_penalty_multi_scores",
"apply_speculative_penalty_multi_scores",
"top_k_top_p_sampling",
"min_p_sampling",
]

View File

@@ -60,6 +60,7 @@ def top_k_top_p_sampling(
"""
top_p_class = envs.FD_SAMPLING_CLASS.lower()
if top_p_class == "air":
_, ids = air_top_p_sampling(x, top_p, threshold, topp_seed, seed=seed, k=k, mode=mode)
elif top_p_class == "rejection":
@@ -154,3 +155,25 @@ def rejection_top_p_sampling(
except ImportError:
raise RuntimeError("Cannot import rejection_top_p_sampling op.")
return ids
def min_p_sampling(
    probs: paddle.Tensor,
    min_p_arr: Optional[paddle.Tensor],
) -> paddle.Tensor:
    """
    Apply min-p filtering to a batch of token probabilities.

    In the non-CUDA fallback, every entry of ``probs`` that is smaller than
    ``min_p * max(probs along the last axis)`` is set to 0. The result is
    NOT renormalized by this function.

    Args:
        probs: Token probabilities; the maximum is taken over the last axis.
            Assumed to be 2-D ``[batch, vocab]`` -- TODO confirm with callers.
        min_p_arr: One min-p threshold per row of ``probs``, or ``None`` when
            min-p is not configured. A value of 0 disables filtering for
            that row, since no probability is below ``0 * max``.

    Returns:
        ``probs`` itself when ``min_p_arr`` is ``None`` or all zeros,
        otherwise the filtered probability tensor.
    """
    # min_p is optional; count_nonzero(None) would raise, so treat a missing
    # threshold tensor as "filtering disabled".
    if min_p_arr is None:
        return probs

    # Fast path: no request in the batch asked for min-p.
    if paddle.count_nonzero(min_p_arr) == 0:
        return probs

    if current_platform.is_cuda():
        # Imported lazily and under an alias so the custom op does not
        # rebind this function's own name inside its body.
        # NOTE(review): presumably implements the same filtering as the
        # fallback below -- confirm against the op's implementation.
        from fastdeploy.model_executor.ops.gpu import (
            min_p_sampling as min_p_sampling_op,
        )

        return min_p_sampling_op(probs, min_p_arr)

    # Reshape thresholds to a column first so both [batch] and [batch, 1]
    # inputs broadcast row-wise against the per-row maximum.
    row_max = paddle.amax(probs, axis=-1, keepdim=True)
    row_threshold = row_max * min_p_arr.reshape([-1, 1])
    below_threshold = probs < row_threshold
    return paddle.where(below_threshold, paddle.full_like(probs, 0.0), probs)

View File

@@ -30,6 +30,7 @@ from fastdeploy.model_executor.layers.sample.meta_data import SamplingMetadata
from fastdeploy.model_executor.layers.sample.ops import (
apply_penalty_multi_scores,
apply_speculative_penalty_multi_scores,
min_p_sampling,
top_k_top_p_sampling,
)
from fastdeploy.platforms import current_platform
@@ -266,6 +267,8 @@ class Sampler(nn.Layer):
probs = F.softmax(logits)
probs = min_p_sampling(probs, sampling_metadata.min_p)
_, next_tokens = top_k_top_p_sampling(probs, sampling_metadata.top_p, sampling_metadata.top_k)
logprobs_tensors = (
@@ -281,6 +284,7 @@ class Sampler(nn.Layer):
sampled_token_ids=next_tokens,
logprobs_tensors=logprobs_tensors,
)
return sampler_output