[Feature][Executor] GPU Model Runner Supports prompt_logprobs and max_logprobs (#4769)

This commit is contained in:
chen
2025-11-05 10:43:25 +08:00
committed by GitHub
parent 74722308f2
commit 1c3ca48128
13 changed files with 203 additions and 22 deletions

View File

@@ -255,9 +255,11 @@ class Sampler(nn.Layer):
def compute_logprobs(
self,
logits: paddle.Tensor,
sampling_metadata: SamplingMetadata,
sampling_metadata: Optional[SamplingMetadata] = None,
) -> paddle.Tensor:
""" """
if sampling_metadata is None:
return F.log_softmax(logits, axis=-1)
last_logits = logits
real_bsz = last_logits.shape[0]
temp_scaled_logprobs = sampling_metadata.temp_scaled_logprobs
@@ -317,6 +319,8 @@ class Sampler(nn.Layer):
assert token_ids.dtype == paddle.int64
logprobs.clip_(min=paddle.finfo(logprobs.dtype).min)
# Get the logprob of the prompt or sampled token.
if len(token_ids.shape) < len(logprobs.shape):
token_ids = token_ids.unsqueeze(-1)
token_logprobs = paddle.take_along_axis(logprobs, token_ids, axis=-1)
# Compute the ranks of the actual token.