[Feature] Support return logprob of generated tokens (#2784)

* online chat support logprobs

* check xpu

* check vl_gpu_model_runner

* only cuda support logprob

* get_worker() check platform

---------

Co-authored-by: Jiang-Jia-Jun <163579578+Jiang-Jia-Jun@users.noreply.github.com>
This commit is contained in:
chen
2025-07-10 15:47:42 +08:00
committed by GitHub
parent 39d2a1de46
commit 823a47e64a
21 changed files with 592 additions and 105 deletions

View File

@@ -720,7 +720,7 @@ class XPUModelRunner(ModelRunnerBase):
# 4. Compute logits, Sample
logits = self.model.compute_logits(hiddden_states)
sampled_token_ids = self.sampler(logits, self.sampling_metadata)
sampler_output = self.sampler(logits, self.sampling_metadata)
# 5. Speculative decode
@@ -749,7 +749,7 @@ class XPUModelRunner(ModelRunnerBase):
accept_tokens=None,
accept_num=None,
)
xpu_post_process(sampled_token_ids=sampled_token_ids,
xpu_post_process(sampled_token_ids=sampler_output.sampled_token_ids,
model_output=model_output_data)
# 7. Update 'infer_seed' and step_paddle()