Files
FastDeploy/test/layers/test_min_sampling.py
lizexu123 67990e0572
Some checks failed
Deploy GitHub Pages / deploy (push) Has been cancelled
[Feature] support min_p_sampling (#2872)
* Fastdeploy support min_p

* add test_min_p

* fix

* min_p_sampling

* update

* delete vl_gpu_model_runner.py

* fix

* Align usage of min_p with vLLM

* fix

* modified unit test

* fix test_min_sampling

* pre-commit all files

* fix

* fix

* fix

* fix xpu_model_runner.py
2025-07-20 23:17:59 -07:00

114 lines
4.3 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import numpy as np
import paddle
import paddle.nn.functional as F
from fastdeploy.model_executor.ops.gpu import min_p_sampling
class TestMinPSampling(unittest.TestCase):
    """Check the FastDeploy ``min_p_sampling`` op against a Paddle reference.

    Every test builds the same synthetic distribution (two dominant tokens
    followed by a linearly decaying tail), filters it once with the custom op
    and once with a reference written in plain Paddle ops, and requires the
    two results to agree within a tight tolerance.
    """

    def setUp(self):
        """Define the vocabulary size and the min_p values used by the tests."""
        self.sample_time = 1000000  # not read by any test visible in this class
        self.vocab_size = 1000
        self.min_p_value = 0.5
        self.batch_size = 3
        self.batch_min_p_values = [0.1, 0.0, 0.9]
        self.additional_batch_min_p_values = [0.1, 0.0, 0.3]

    def _build_probs(self, batch_size=1):
        """Return softmax probabilities of the synthetic test logits.

        Each of the ``batch_size`` rows is identical: logit 10 for token 0,
        logit 8 for token 1, and a linear ramp from 2.0 down to 0.0 over the
        remaining ``vocab_size - 2`` tokens.
        """
        logits = paddle.ones(shape=[batch_size, self.vocab_size], dtype="float32")
        # The tail is the same for every row, so compute it once.
        tail_logits = paddle.linspace(2.0, 0.0, self.vocab_size - 2)
        for row in range(batch_size):
            # Single tuple-index assignments write straight into ``logits``
            # and do not rely on ``logits[row]`` being a view of the parent.
            logits[row, 0] = 10
            logits[row, 1] = 8
            logits[row, 2:] = tail_logits
        return F.softmax(logits, axis=-1)

    def min_p_sampling_cpu(self, min_p):
        """Reference min_p filter built from plain Paddle ops.

        Args:
            min_p: tensor of min_p thresholds, one entry per expected row.

        Returns:
            The synthetic probabilities with every entry below
            ``min_p * max(probs)`` set to 0. The result is not renormalized.
            A single probability row is used; broadcasting against
            ``min_p.reshape([-1, 1])`` yields one output row per threshold.
        """
        probs = self._build_probs()
        max_probabilities = paddle.amax(probs, axis=-1, keepdim=True)
        adjusted_min_p = max_probabilities * min_p.reshape([-1, 1])
        invalid_token_mask = probs < adjusted_min_p
        return paddle.where(invalid_token_mask, paddle.full_like(probs, 0.0), probs)

    def fastdeploy_min_p_sampling(self, min_p):
        """Run the FastDeploy ``min_p_sampling`` op on a single synthetic row."""
        return min_p_sampling(self._build_probs(), min_p)

    def fastdeploy_batch_min_p_sampling(self, batch_size, min_p_values):
        """Run the FastDeploy ``min_p_sampling`` op on a batch of identical rows.

        Args:
            batch_size: number of rows in the synthetic batch.
            min_p_values: per-row min_p thresholds; converted to a float32
                tensor before being handed to the op.
        """
        probs = self._build_probs(batch_size)
        min_p_arr = paddle.to_tensor(min_p_values, dtype="float32")
        return min_p_sampling(probs, min_p_arr)

    def compare_results(self, probs, probs_cpu, atol=1e-6, rtol=1e-6):
        """Assert that the op output and the reference output are close.

        Args:
            probs: output of the FastDeploy op.
            probs_cpu: output of the reference implementation.
            atol: absolute tolerance forwarded to ``assert_allclose``.
            rtol: relative tolerance forwarded to ``assert_allclose``.

        Raises:
            AssertionError: if the two results differ beyond the tolerances;
                the original numpy assertion is kept as the explicit cause.
        """
        probs_np = probs.numpy()
        probs_cpu_np = probs_cpu.numpy()
        try:
            np.testing.assert_allclose(
                probs_np,
                probs_cpu_np,
                rtol=rtol,
                atol=atol,
            )
        except AssertionError as e:
            raise AssertionError(
                f"The results are different between fastdeploy_min_p_sampling and min_p_sampling_cpu:\n{e}"
            ) from e
        else:
            print("The results are same between fastdeploy_min_p_sampling and min_p_sampling_cpu")

    def test_single_min_p_sampling(self):
        """A single row filtered with min_p = 0.5."""
        min_p = paddle.to_tensor([self.min_p_value], dtype="float32")
        probs = self.fastdeploy_min_p_sampling(min_p)
        probs_cpu = self.min_p_sampling_cpu(min_p)
        self.compare_results(probs, probs_cpu)

    def test_batch_min_p_sampling(self):
        """A batch of three rows filtered with min_p = [0.1, 0.0, 0.9]."""
        batch_min_p = paddle.to_tensor(self.batch_min_p_values, dtype="float32")
        batch_probs = self.fastdeploy_batch_min_p_sampling(self.batch_size, batch_min_p)
        batch_probs_cpu = self.min_p_sampling_cpu(batch_min_p)
        self.compare_results(batch_probs, batch_probs_cpu)

    def test_additional_batch_min_p_sampling(self):
        """A batch of three rows filtered with min_p = [0.1, 0.0, 0.3]."""
        additional_batch_min_p = paddle.to_tensor(self.additional_batch_min_p_values, dtype="float32")
        additional_batch_probs = self.fastdeploy_batch_min_p_sampling(self.batch_size, additional_batch_min_p)
        additional_batch_probs_cpu = self.min_p_sampling_cpu(additional_batch_min_p)
        self.compare_results(additional_batch_probs, additional_batch_probs_cpu)
# Run the suite only when this file is executed directly on a CUDA-enabled
# Paddle build (the op under test is imported from the GPU ops module);
# otherwise the script finishes without running any test.
if __name__ == "__main__" and paddle.is_compiled_with_cuda():
    unittest.main()