[Feature][Executor] GPU Model Runner Supports prompt_logprobs and max_logprobs (#4769)

2025-12-24 13:28:13 +08:00 · 2025-11-05 10:43:25 +08:00
parent 74722308f2
commit 1c3ca48128
13 changed files with 203 additions and 22 deletions
--- a/fastdeploy/engine/args_utils.py
+++ b/fastdeploy/engine/args_utils.py
@@ -44,6 +44,7 @@ from fastdeploy.scheduler.config import SchedulerConfig
 from fastdeploy.utils import (
    DeprecatedOptionWarning,
    FlexibleArgumentParser,
+    console_logger,
    is_port_available,
    parse_quantization,
 )
@@ -392,6 +393,12 @@ class EngineArgs:
    Must be explicitly enabled via the `--enable-logprob` startup parameter to output logprob values.
    """

+    max_logprobs: int = 20
+    """
+    Maximum number of log probabilities to return when `enable_logprob` is True. The default value comes from the default for the
+    OpenAI Chat Completions API. -1 means no cap, i.e. all (output_length * vocab_size) logprobs may be returned, which may cause OOM.
+    """
+
    logprobs_mode: str = "raw_logprobs"
    """
    Indicates the content returned in the logprobs.
@@ -458,6 +465,13 @@ class EngineArgs:
                raise NotImplementedError("Only CUDA platform supports logprob.")
            if self.speculative_config is not None and self.logprobs_mode.startswith("processed"):
                raise NotImplementedError("processed_logprobs not support in speculative.")
+            if self.speculative_config is not None and self.max_logprobs == -1:
+                raise NotImplementedError("max_logprobs=-1 not support in speculative.")
+            if not envs.FD_USE_GET_SAVE_OUTPUT_V1:
+                self.max_logprobs = 20
+                console_logger.warning("Set max_logprobs=20 when FD_USE_GET_SAVE_OUTPUT_V1=0")
+            if self.max_logprobs == -1 and not envs.ENABLE_V1_KVCACHE_SCHEDULER:
+                raise NotImplementedError("Only ENABLE_V1_KVCACHE_SCHEDULER=1 support max_logprobs=-1")

        if self.splitwise_role != "mixed" and self.cache_transfer_protocol != "rdma":
            envs.ENABLE_V1_KVCACHE_SCHEDULER = 0
@@ -672,6 +686,12 @@ class EngineArgs:
            default=EngineArgs.enable_logprob,
            help="Enable output of token-level log probabilities.",
        )
+        model_group.add_argument(
+            "--max-logprobs",
+            type=int,
+            default=EngineArgs.max_logprobs,
+            help="Maximum number of log probabilities.",
+        )
        model_group.add_argument(
            "--logprobs-mode",
            type=str,