[Feature] Support returning logprobs of generated tokens (#2784)

* support logprobs in online chat

* check XPU

* check vl_gpu_model_runner

* restrict logprob support to CUDA

* make get_worker() check the platform

---------

Co-authored-by: Jiang-Jia-Jun <163579578+Jiang-Jia-Jun@users.noreply.github.com>
Author: chen
Date: 2025-07-10 15:47:42 +08:00
Committed by: GitHub
Commit: 823a47e64a
Parent: 39d2a1de46
21 changed files with 592 additions and 105 deletions


@@ -296,6 +296,12 @@ class EngineArgs:
     max_capture_batch_size=64, FastDeploy will capture graphs for batches [1,64].
     """
+    enable_logprob: bool = False
+    """
+    Flag to enable logprob output. Default is False (disabled).
+    Must be explicitly enabled via the `--enable-logprob` startup parameter to output logprob values.
+    """
+
     def __post_init__(self):
         """
         Post-initialization processing to set default tokenizer if not provided.
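
Per the squashed commit messages above ("restrict logprob support to CUDA", "make get_worker() check the platform"), the flag is validated against the runtime platform before use. A minimal sketch of what such a gate could look like; the function name and message are illustrative, not FastDeploy's actual code:

# Hypothetical platform gate implied by the commit messages; the real check
# lives in get_worker() and the model runners, not in a helper like this.
def validate_logprob_support(enable_logprob: bool, platform: str) -> None:
    if enable_logprob and platform != "cuda":
        raise NotImplementedError(
            f"logprob output is only supported on CUDA, got platform={platform!r}")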
@@ -416,6 +422,11 @@ class EngineArgs:
             help=
             "Disabled any whitespaces when using guided decoding backend XGrammar."
         )
+        model_group.add_argument("--enable-logprob",
+                                 action="store_true",
+                                 default=EngineArgs.enable_logprob,
+                                 help="Enable output of token-level log probabilities.")
+
         # Parallel processing parameters group
         parallel_group = parser.add_argument_group("Parallel Configuration")
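
Using `default=EngineArgs.enable_logprob` instead of a hard-coded `False` keeps the CLI default in sync with the dataclass field added above. A self-contained sketch of the same wiring, trimmed down to this one field:

import argparse
from dataclasses import dataclass

@dataclass
class EngineArgs:
    # Single source of truth for the default; mirrors the field added above.
    enable_logprob: bool = False

parser = argparse.ArgumentParser()
parser.add_argument("--enable-logprob",
                    action="store_true",
                    default=EngineArgs.enable_logprob,
                    help="Enable output of token-level log probabilities.")

print(parser.parse_args([]).enable_logprob)                    # False
print(parser.parse_args(["--enable-logprob"]).enable_logprob)  # True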
@@ -791,4 +802,5 @@ class EngineArgs:
             max_capture_batch_size=self.max_capture_batch_size,
             guided_decoding_backend=self.guided_decoding_backend,
             disable_any_whitespace=self.guided_decoding_disable_any_whitespace,
+            enable_logprob=self.enable_logprob,
         )
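
Once the server is launched with `--enable-logprob`, the "support logprobs in online chat" change lets clients request per-token logprobs through the chat endpoint. An example assuming FastDeploy mirrors the OpenAI-compatible response schema; the URL, model name, and top_logprobs value are placeholders:

import requests

resp = requests.post(
    "http://localhost:8000/v1/chat/completions",  # placeholder host/port
    json={
        "model": "default",                        # placeholder model name
        "messages": [{"role": "user", "content": "Hello!"}],
        "logprobs": True,   # ignored or rejected unless --enable-logprob was set
        "top_logprobs": 5,  # alternatives per generated token (assumption)
    },
    timeout=30,
)
# Assumes the OpenAI-style shape: choices[0].logprobs.content[*]
for tok in resp.json()["choices"][0]["logprobs"]["content"]:
    print(tok["token"], tok["logprob"])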