mirror of
https://github.com/PaddlePaddle/FastDeploy.git
synced 2025-12-24 13:28:13 +08:00
[Feature][Executor] GPU Model Runner Supports prompt_logprobs and max_logprobs (#4769)
This commit is contained in:
@@ -255,9 +255,11 @@ class Sampler(nn.Layer):
|
||||
def compute_logprobs(
|
||||
self,
|
||||
logits: paddle.Tensor,
|
||||
sampling_metadata: SamplingMetadata,
|
||||
sampling_metadata: Optional[SamplingMetadata] = None,
|
||||
) -> paddle.Tensor:
|
||||
""" """
|
||||
if sampling_metadata is None:
|
||||
return F.log_softmax(logits, axis=-1)
|
||||
last_logits = logits
|
||||
real_bsz = last_logits.shape[0]
|
||||
temp_scaled_logprobs = sampling_metadata.temp_scaled_logprobs
|
||||
@@ -317,6 +319,8 @@ class Sampler(nn.Layer):
|
||||
assert token_ids.dtype == paddle.int64
|
||||
logprobs.clip_(min=paddle.finfo(logprobs.dtype).min)
|
||||
# Gather the logprob of each prompt or sampled token.
|
||||
if len(token_ids.shape) < len(logprobs.shape):
|
||||
token_ids = token_ids.unsqueeze(-1)
|
||||
token_logprobs = paddle.take_along_axis(logprobs, token_ids, axis=-1)
|
||||
|
||||
# Compute the ranks of the actual token.
|
||||
|
||||
Reference in New Issue
Block a user