[Feature] General support for logprobs (#2974)

* [Feature] support logprobs in chat/completions and completions endpoints

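For reference, a client-side sketch of the new parameters, assuming an OpenAI-compatible FastDeploy server; the base URL and model name are placeholders, and the parameter shapes follow the OpenAI convention (boolean logprobs plus integer top_logprobs on chat/completions, integer logprobs on completions):

import openai

client = openai.OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")  # placeholder endpoint

# chat/completions: request per-token logprobs with 5 alternatives each
chat = client.chat.completions.create(
    model="my-model",  # placeholder
    messages=[{"role": "user", "content": "Hello"}],
    logprobs=True,
    top_logprobs=5,
)
print(chat.choices[0].logprobs)

# completions: logprobs is an integer count of alternatives
comp = client.completions.create(model="my-model", prompt="Hello", logprobs=5)
print(comp.choices[0].logprobs)
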
* Temporarily comment out text_offset due to incorrect logic

* Clean up temporary debug prints

* [Feature] support logprobs in offline mode via SamplingParams

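A minimal sketch of the offline path this enables, assuming SamplingParams now accepts a logprobs count; the LLM entry point and output field names are illustrative rather than confirmed by this diff:

from fastdeploy.engine.sampling_params import SamplingParams

# Ask for the top-5 logprobs per generated token (illustrative value).
params = SamplingParams(temperature=0.8, max_tokens=64, logprobs=5)

# Engine wiring elided; the generate() call below is illustrative.
# llm = LLM(model="path/to/model")
# for out in llm.generate(["Hello"], params):
#     print(out.outputs.logprobs)  # per-token logprob data (SampleLogprobs)
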
* fix: serialize Logprob as dict before zmq send to fix msgpack error

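The underlying issue in miniature: msgpack serializes only primitives, lists, and dicts, not arbitrary Python objects, hence converting Logprob to a dict before the zmq send. The Logprob shape below is illustrative, not FastDeploy's actual definition:

from dataclasses import asdict, dataclass

import msgpack

@dataclass
class Logprob:  # illustrative stand-in
    logprob: float
    rank: int
    decoded_token: str

lp = Logprob(logprob=-0.12, rank=1, decoded_token="Hello")
# msgpack.packb(lp) raises TypeError, since msgpack has no encoder for
# custom objects; packing the dict form works.
payload = msgpack.packb(asdict(lp))
restored = msgpack.unpackb(payload)  # {'logprob': -0.12, 'rank': 1, ...}
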
* refactor: remove redundant methods to simplify codebase

* Fix missing fields in CompletionOutput.to_dict affecting msgpack serialization

* refactor: centralize param validation in engine_client to reduce duplication

* revert: rollback changes in offline_demo.py

* [bugfix] fix parameter validation for logprobs
---------

Co-authored-by: Jiang-Jia-Jun <163579578+Jiang-Jia-Jun@users.noreply.github.com>
Author: SunLei
Date: 2025-07-31 20:25:56 +08:00 (committed by GitHub)
Parent: fe17410f9c
Commit: dade19d7a4
10 changed files with 330 additions and 44 deletions


@@ -25,7 +25,7 @@ import numpy as np
 from fastdeploy.engine.sampling_params import SamplingParams
 from fastdeploy.utils import data_processor_logger
-from fastdeploy.worker.output import LogprobsLists
+from fastdeploy.worker.output import LogprobsLists, SampleLogprobs

 class RequestStatus(Enum):
@@ -245,6 +245,7 @@ class CompletionOutput:
     token_ids: list[int]
     logprob: Optional[float] = None
     top_logprobs: Optional[LogprobsLists] = None
+    logprobs: Optional[SampleLogprobs] = None
     draft_token_ids: list[int] = None
     text: Optional[str] = None
     reasoning_content: Optional[str] = None
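
Note the two containers are distinct: top_logprobs is a LogprobsLists of parallel per-step lists (its field names appear in the add() hunk below), while the new logprobs carries per-sample data as SampleLogprobs. A hedged construction sketch, inferred only from the fields this diff touches; the real definitions live in fastdeploy/worker/output.py:

from fastdeploy.worker.output import LogprobsLists

# Keyword construction is assumed; these three parallel lists are the
# fields the merge logic in RequestOutput.add() extends.
top = LogprobsLists(
    logprob_token_ids=[[11, 7]],  # candidate token ids per decode step
    logprobs=[[-0.1, -2.3]],      # matching log-probabilities
    sampled_token_ranks=[0],      # rank of the sampled token per step
)
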
@@ -259,6 +260,7 @@ class CompletionOutput:
             "token_ids": self.token_ids,
             "logprob": self.logprob,
             "top_logprobs": self.top_logprobs,
+            "logprobs": self.logprobs,
             "draft_token_ids": self.draft_token_ids,
             "text": self.text,
             "reasoning_content": self.reasoning_content,
@@ -281,7 +283,8 @@ class CompletionOutput:
             f"text={self.text!r}, "
             f"token_ids={self.token_ids}, "
             f"draft_token_ids={self.draft_token_ids}, "
-            f"reasoning_content={self.reasoning_content!r}"
+            f"reasoning_content={self.reasoning_content!r}, "
+            f"logprobs={self.logprobs}, "
         )
@@ -390,16 +393,20 @@ class RequestOutput:
     def add(self, next_output: RequestOutput) -> None:
         """Merge RequestOutput into this one"""
         self.prompt = next_output.prompt
         self.prompt_token_ids = next_output.prompt_token_ids
         self.finished |= next_output.finished
         self.outputs.index = next_output.outputs.index
         self.outputs.token_ids.extend(next_output.outputs.token_ids)
         if next_output.metrics.arrival_time is not None and self.metrics.inference_start_time is not None:
             self.metrics.model_forward_time = next_output.metrics.arrival_time - self.metrics.inference_start_time
         if next_output.metrics.arrival_time is not None and self.metrics.arrival_time is not None:
             self.metrics.model_execute_time = next_output.metrics.arrival_time - self.metrics.arrival_time
+        if next_output.outputs.top_logprobs is not None:
+            self.outputs.top_logprobs.logprob_token_ids.extend(next_output.outputs.top_logprobs.logprob_token_ids)
+            self.outputs.top_logprobs.logprobs.extend(next_output.outputs.top_logprobs.logprobs)
+            self.outputs.top_logprobs.sampled_token_ranks.extend(next_output.outputs.top_logprobs.sampled_token_ranks)

     def __repr__(self) -> str:
         return (
@@ -407,8 +414,9 @@ class RequestOutput:
             f"prompt={self.prompt!r}, "
             f"prompt_token_ids={self.prompt_token_ids}, "
             f"outputs={self.outputs}, "
+            f"finished={self.finished}, "
+            f"num_cached_tokens={self.num_cached_tokens}, "
             f"metrics={self.metrics}, "
-            f"num_cached_tokens={self.num_cached_tokens})"
         )

     @classmethod
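
Taken together, the add() changes mean streamed chunks now accumulate logprob data alongside token ids when merged; an illustrative accumulation loop (the chunk source is hypothetical):

final = None
for chunk in iter_request_outputs():  # hypothetical stream of RequestOutput
    if final is None:
        final = chunk
    else:
        final.add(chunk)  # extends token_ids and top_logprobs in place
print(final.outputs.token_ids)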