[Feature][Executor] GPU Model Runner Supports prompt_logprobs and max_logprobs (#4769)

This commit is contained in:
chen
2025-11-05 10:43:25 +08:00
committed by GitHub
parent 74722308f2
commit 1c3ca48128
13 changed files with 203 additions and 22 deletions

View File

@@ -44,6 +44,7 @@ from fastdeploy.scheduler.config import SchedulerConfig
from fastdeploy.utils import (
DeprecatedOptionWarning,
FlexibleArgumentParser,
console_logger,
is_port_available,
parse_quantization,
)
@@ -392,6 +393,12 @@ class EngineArgs:
Must be explicitly enabled via the `--enable-logprob` startup parameter to output logprob values.
"""
max_logprobs: int = 20
"""
Maximum number of log probabilities to return when `enable_logprob` is True. The default value comes from the default for the
OpenAI Chat Completions API. -1 means no cap, i.e. all (output_length * vocab_size) logprobs are allowed to be returned and it may cause OOM.
"""
logprobs_mode: str = "raw_logprobs"
"""
Indicates the content returned in the logprobs.
@@ -458,6 +465,13 @@ class EngineArgs:
raise NotImplementedError("Only CUDA platform supports logprob.")
if self.speculative_config is not None and self.logprobs_mode.startswith("processed"):
raise NotImplementedError("processed_logprobs not support in speculative.")
if self.speculative_config is not None and self.max_logprobs == -1:
raise NotImplementedError("max_logprobs=-1 not support in speculative.")
if not envs.FD_USE_GET_SAVE_OUTPUT_V1:
self.max_logprobs = 20
console_logger.warning("Set max_logprobs=20 when FD_USE_GET_SAVE_OUTPUT_V1=0")
if self.max_logprobs == -1 and not envs.ENABLE_V1_KVCACHE_SCHEDULER:
raise NotImplementedError("Only ENABLE_V1_KVCACHE_SCHEDULER=1 support max_logprobs=-1")
if self.splitwise_role != "mixed" and self.cache_transfer_protocol != "rdma":
envs.ENABLE_V1_KVCACHE_SCHEDULER = 0
@@ -672,6 +686,12 @@ class EngineArgs:
default=EngineArgs.enable_logprob,
help="Enable output of token-level log probabilities.",
)
model_group.add_argument(
"--max-logprobs",
type=int,
default=EngineArgs.max_logprobs,
help="Maximum number of log probabilities.",
)
model_group.add_argument(
"--logprobs-mode",
type=str,