[Feature] support min_p_sampling (#2872)

* Fastdeploy support min_p

* add test_min_p

* fix

* min_p_sampling

* update

* delete vl_gpu_model_runner.py

* fix

* Align usage of min_p with vLLM

* fix

* modified unit test

* fix test_min_sampling

* pre-commit all files

* fix

* fix

* fix

* fix xpu_model_runner.py
Authored by lizexu123 on 2025-07-21 14:17:59 +08:00, committed by GitHub
parent 95a214ae43
commit 67990e0572
15 changed files with 302 additions and 1 deletions

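For context, min-p sampling keeps only the tokens whose probability is at least min_p times the probability of the most likely token; everything below that dynamic threshold is masked out before sampling. The sketch below illustrates the filtering rule in plain NumPy (the helper name min_p_filter is made up for illustration; it is not the FastDeploy kernel, which is the min_p_sampling GPU op exercised in the test file below):

# Illustrative min-p filtering sketch (NumPy), mirroring the CPU reference in the test.
# Tokens with probability below min_p * max(probs) are zeroed; the sampler then
# draws from the surviving (implicitly renormalized) mass.
import numpy as np


def min_p_filter(probs: np.ndarray, min_p: np.ndarray) -> np.ndarray:
    """probs: [batch, vocab] softmax outputs; min_p: [batch] thresholds in [0, 1]."""
    max_probs = probs.max(axis=-1, keepdims=True)   # [batch, 1] top probability per row
    threshold = max_probs * min_p.reshape(-1, 1)    # per-row dynamic cutoff
    return np.where(probs < threshold, 0.0, probs)


probs = np.array([[0.5, 0.3, 0.15, 0.05]])
print(min_p_filter(probs, np.array([0.5])))  # -> [[0.5, 0.3, 0.0, 0.0]]

With min_p = 0.0 the filter is a no-op, and with values close to 1.0 only tokens nearly as likely as the top token survive; the tests below exercise thresholds of 0.0, 0.1, 0.3, 0.5, and 0.9.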

@@ -0,0 +1,113 @@
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest

import numpy as np
import paddle
import paddle.nn.functional as F

from fastdeploy.model_executor.ops.gpu import min_p_sampling


class TestMinPSampling(unittest.TestCase):
    def setUp(self):
        self.sample_time = 1000000
        self.vocab_size = 1000
        self.min_p_value = 0.5
        self.batch_size = 3
        self.batch_min_p_values = [0.1, 0.0, 0.9]
        self.additional_batch_min_p_values = [0.1, 0.0, 0.3]

    # CPU reference: zero out tokens whose probability is below min_p * max(probs).
    # Because every batch row in these tests uses the same logits, broadcasting a
    # batched min_p over this single row also yields the batched reference.
    def min_p_sampling_cpu(self, min_p):
        logits = paddle.ones(shape=[1, self.vocab_size], dtype="float32")
        logits[0][0] = 10
        logits[0][1] = 8
        low_prob_tensor = paddle.linspace(2.0, 0.0, self.vocab_size - 2)
        logits[0][2:] = low_prob_tensor
        probs = F.softmax(logits)
        max_probabilities = paddle.amax(probs, axis=-1, keepdim=True)
        adjusted_min_p = max_probabilities * min_p.reshape([-1, 1])
        invalid_token_mask = probs < adjusted_min_p
        probs = paddle.where(invalid_token_mask, paddle.full_like(probs, 0.0), probs)
        return probs

    # FastDeploy GPU op on the same single-row distribution (min_p = 0.5 in the single test).
    def fastdeploy_min_p_sampling(self, min_p):
        logits = paddle.ones(shape=[1, self.vocab_size], dtype="float32")
        logits[0][0] = 10
        logits[0][1] = 8
        low_prob_tensor = paddle.linspace(2.0, 0.0, self.vocab_size - 2)
        logits[0][2:] = low_prob_tensor
        probs = F.softmax(logits)
        probs = min_p_sampling(probs, min_p)
        return probs

    # FastDeploy GPU op on a batch with per-row min_p values, e.g. [0.1, 0.0, 0.9].
    def fastdeploy_batch_min_p_sampling(self, batch_size, min_p_values):
        logits = paddle.ones(shape=[batch_size, self.vocab_size], dtype="float32")
        for b in range(batch_size):
            logits[b][0] = 10
            logits[b][1] = 8
            logits[b][2:] = paddle.linspace(2.0, 0.0, self.vocab_size - 2)
        probs = F.softmax(logits, axis=-1)
        min_p_arr = paddle.to_tensor(min_p_values, dtype="float32")
        probs = min_p_sampling(probs, min_p_arr)
        return probs

    def compare_results(self, probs, probs_cpu, atol=1e-6, rtol=1e-6):
        probs_np = probs.numpy()
        probs_cpu_np = probs_cpu.numpy()
        try:
            np.testing.assert_allclose(
                probs_np,
                probs_cpu_np,
                rtol=rtol,
                atol=atol,
            )
            print("The results are the same between fastdeploy_min_p_sampling and min_p_sampling_cpu")
        except AssertionError as e:
            raise AssertionError(
                f"The results differ between fastdeploy_min_p_sampling and min_p_sampling_cpu:\n{str(e)}"
            )

    def test_single_min_p_sampling(self):
        min_p = paddle.to_tensor([self.min_p_value], dtype="float32")
        probs = self.fastdeploy_min_p_sampling(min_p)
        probs_cpu = self.min_p_sampling_cpu(min_p)
        self.compare_results(probs, probs_cpu)

    def test_batch_min_p_sampling(self):
        batch_min_p = paddle.to_tensor(self.batch_min_p_values, dtype="float32")
        batch_probs = self.fastdeploy_batch_min_p_sampling(self.batch_size, batch_min_p)
        batch_probs_cpu = self.min_p_sampling_cpu(batch_min_p)
        self.compare_results(batch_probs, batch_probs_cpu)

    def test_additional_batch_min_p_sampling(self):
        additional_batch_min_p = paddle.to_tensor(self.additional_batch_min_p_values, dtype="float32")
        additional_batch_probs = self.fastdeploy_batch_min_p_sampling(self.batch_size, additional_batch_min_p)
        additional_batch_probs_cpu = self.min_p_sampling_cpu(additional_batch_min_p)
        self.compare_results(additional_batch_probs, additional_batch_probs_cpu)


if __name__ == "__main__":
    if paddle.is_compiled_with_cuda():
        unittest.main()
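
Since the commit message says usage of min_p is aligned with vLLM, min_p should surface as a per-request sampling parameter in the [0, 1] range, where 0 disables the filter. A hypothetical offline-inference sketch, assuming FastDeploy mirrors vLLM's SamplingParams interface (the import, class names, and model path below are assumptions, not taken from this diff):

# Hypothetical usage sketch; names assumed to mirror vLLM, not verified against this diff.
from fastdeploy import LLM, SamplingParams  # assumed import path

llm = LLM(model="/path/to/your/model")  # placeholder model path
params = SamplingParams(
    temperature=0.8,
    min_p=0.1,  # keep only tokens with prob >= 0.1 * max token prob; 0.0 disables min-p
)
outputs = llm.generate(["Hello, my name is"], params)
print(outputs)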