mirror of
https://github.com/PaddlePaddle/FastDeploy.git
synced 2025-12-24 13:28:13 +08:00
[Feature][Executor] GPU Model Runner Supports prompt_logprobs and max_logprobs (#4769)
This commit is contained in:
@@ -44,6 +44,7 @@ from fastdeploy.scheduler.config import SchedulerConfig
|
||||
from fastdeploy.utils import (
|
||||
DeprecatedOptionWarning,
|
||||
FlexibleArgumentParser,
|
||||
console_logger,
|
||||
is_port_available,
|
||||
parse_quantization,
|
||||
)
|
||||
@@ -392,6 +393,12 @@ class EngineArgs:
|
||||
Must be explicitly enabled via the `--enable-logprob` startup parameter to output logprob values.
|
||||
"""
|
||||
|
||||
max_logprobs: int = 20
|
||||
"""
|
||||
Maximum number of log probabilities to return when `enable_logprob` is True. The default value comes the default for the
|
||||
OpenAI Chat Completions API. -1 means no cap, i.e. all (output_length * vocab_size) logprobs are allowed to be returned and it may cause OOM.
|
||||
"""
|
||||
|
||||
logprobs_mode: str = "raw_logprobs"
|
||||
"""
|
||||
Indicates the content returned in the logprobs.
|
||||
@@ -458,6 +465,13 @@ class EngineArgs:
|
||||
raise NotImplementedError("Only CUDA platform supports logprob.")
|
||||
if self.speculative_config is not None and self.logprobs_mode.startswith("processed"):
|
||||
raise NotImplementedError("processed_logprobs not support in speculative.")
|
||||
if self.speculative_config is not None and self.max_logprobs == -1:
|
||||
raise NotImplementedError("max_logprobs=-1 not support in speculative.")
|
||||
if not envs.FD_USE_GET_SAVE_OUTPUT_V1:
|
||||
self.max_logprobs = 20
|
||||
console_logger.warning("Set max_logprobs=20 when FD_USE_GET_SAVE_OUTPUT_V1=0")
|
||||
if self.max_logprobs == -1 and not envs.ENABLE_V1_KVCACHE_SCHEDULER:
|
||||
raise NotImplementedError("Only ENABLE_V1_KVCACHE_SCHEDULER=1 support max_logprobs=-1")
|
||||
|
||||
if self.splitwise_role != "mixed" and self.cache_transfer_protocol != "rdma":
|
||||
envs.ENABLE_V1_KVCACHE_SCHEDULER = 0
|
||||
@@ -672,6 +686,12 @@ class EngineArgs:
|
||||
default=EngineArgs.enable_logprob,
|
||||
help="Enable output of token-level log probabilities.",
|
||||
)
|
||||
model_group.add_argument(
|
||||
"--max-logprobs",
|
||||
type=int,
|
||||
default=EngineArgs.max_logprobs,
|
||||
help="Maximum number of log probabilities.",
|
||||
)
|
||||
model_group.add_argument(
|
||||
"--logprobs-mode",
|
||||
type=str,
|
||||
|
||||
Reference in New Issue
Block a user