[Feature][Executor] GPU Model Runner Supports prompt_logprobs and max_logprobs (#4769)

2025-12-24 13:28:13 +08:00 · 2025-11-05 10:43:25 +08:00
parent 74722308f2
commit 1c3ca48128
13 changed files with 203 additions and 22 deletions
--- a/fastdeploy/model_executor/layers/sample/sampler.py
+++ b/fastdeploy/model_executor/layers/sample/sampler.py
@@ -255,9 +255,11 @@ class Sampler(nn.Layer):
    def compute_logprobs(
        self,
        logits: paddle.Tensor,
-        sampling_metadata: SamplingMetadata,
+        sampling_metadata: Optional[SamplingMetadata] = None,
    ) -> paddle.Tensor:
        """ """
+        if sampling_metadata is None:
+            return F.log_softmax(logits, axis=-1)
        last_logits = logits
        real_bsz = last_logits.shape[0]
        temp_scaled_logprobs = sampling_metadata.temp_scaled_logprobs
@@ -317,6 +319,8 @@ class Sampler(nn.Layer):
        assert token_ids.dtype == paddle.int64
        logprobs.clip_(min=paddle.finfo(logprobs.dtype).min)
        # Get with the logprob of the prompt or sampled token.
+        if len(token_ids.shape) < len(logprobs.shape):
+            token_ids = token_ids.unsqueeze(-1)
        token_logprobs = paddle.take_along_axis(logprobs, token_ids, axis=-1)

        # Compute the ranks of the actual token.