[Feature] Support return logprob of generated tokens (#2784)

* online chat support logprobs

* check xpu

* check vl_gpu_model_runner

* only cuda support logprob

* get_worker() check platform

---------

Co-authored-by: Jiang-Jia-Jun <163579578+Jiang-Jia-Jun@users.noreply.github.com>
This commit is contained in:
chen
2025-07-10 15:47:42 +08:00
committed by GitHub
parent 39d2a1de46
commit 823a47e64a
21 changed files with 592 additions and 105 deletions

View File

@@ -720,7 +720,7 @@ class XPUModelRunner(ModelRunnerBase):
# 4. Compute logits, Sample
logits = self.model.compute_logits(hiddden_states)
sampled_token_ids = self.sampler(logits, self.sampling_metadata)
sampler_output = self.sampler(logits, self.sampling_metadata)
# 5. Speculative decode
@@ -749,7 +749,7 @@ class XPUModelRunner(ModelRunnerBase):
accept_tokens=None,
accept_num=None,
)
xpu_post_process(sampled_token_ids=sampled_token_ids,
xpu_post_process(sampled_token_ids=sampler_output.sampled_token_ids,
model_output=model_output_data)
# 7. Update 'infer_seed' and step_paddle()