[Feature] Add temp_scaled_logprobs and top_p_normalized_logprobs parameters for logits and logprobs post-processing (#3552)

* [feature] Add temp_scaled_logprobs and top_p_normalized_logprobs parameters for logits and logprobs post-processing

* inference engine support for temp_scaled_logprobs and top_p_normalized_logprobs

* delete some code

* code check

* code check and add doc

* fix tokenizer.decode(-1) to return 'Invalid Token'

* add CI for temp_scaled and top_p logprobs

* check test

* check seq len time shape

* clip inf values in logprobs

---------

Co-authored-by: sunlei1024 <sunlei5788@gmail.com>
Author: chen
Date: 2025-08-25 14:11:49 +08:00
Committed by: GitHub
Parent: 2410adb041
Commit: 9cab3f47ff
8 changed files with 195 additions and 8 deletions
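
For context, a minimal sketch of what these two post-processing options typically compute: temperature scaling divides the raw logits by the sampling temperature before the log-softmax, so the reported logprobs match the distribution that was actually sampled from; top-p normalization renormalizes the probability mass over the nucleus of tokens whose cumulative probability reaches top_p. The function names and pure-Python style below are illustrative only and are not the code added by this commit:

```python
import math

def temperature_scaled_logprobs(logits, temperature):
    """Divide raw logits by the sampling temperature, then take log-softmax."""
    scaled = [x / max(temperature, 1e-6) for x in logits]
    m = max(scaled)
    log_z = m + math.log(sum(math.exp(x - m) for x in scaled))  # logsumexp
    return [x - log_z for x in scaled]

def top_p_normalized_logprobs(logprobs, top_p):
    """Renormalize over the smallest set of tokens whose cumulative
    probability reaches top_p; tokens outside that nucleus get -inf."""
    order = sorted(range(len(logprobs)), key=lambda i: logprobs[i], reverse=True)
    nucleus, cum = set(), 0.0
    for i in order:
        nucleus.add(i)
        cum += math.exp(logprobs[i])
        if cum >= top_p:
            break
    log_cum = math.log(cum)
    return [lp - log_cum if i in nucleus else float("-inf")
            for i, lp in enumerate(logprobs)]
```

Tokens outside the nucleus end up at -inf under this definition, which is presumably what the "clip inf values in logprobs" commit above addresses in the returned results.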


@@ -403,6 +403,9 @@ class CompletionRequest(BaseModel):
echo: Optional[bool] = False
frequency_penalty: Optional[float] = None
logprobs: Optional[int] = None
# For logits and logprobs post processing
temp_scaled_logprobs: bool = False
top_p_normalized_logprobs: bool = False
max_tokens: Optional[int] = None
n: int = 1
presence_penalty: Optional[float] = None
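
With the new CompletionRequest fields, a client can opt into the post-processing per request. A hypothetical call against an OpenAI-compatible /v1/completions endpoint (host, port, and model name below are placeholders, not part of this commit):

```python
import requests

payload = {
    "model": "my-model",                # placeholder model name
    "prompt": "The capital of France is",
    "max_tokens": 8,
    "temperature": 0.7,
    "logprobs": 5,                      # top-5 logprobs per generated token
    "temp_scaled_logprobs": True,       # new: scale logits by 1/temperature first
    "top_p_normalized_logprobs": True,  # new: renormalize over the top-p nucleus
}
resp = requests.post("http://localhost:8000/v1/completions", json=payload)
print(resp.json()["choices"][0]["logprobs"])
```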
@@ -534,6 +537,11 @@ class ChatCompletionRequest(BaseModel):
frequency_penalty: Optional[float] = None
logprobs: Optional[bool] = False
top_logprobs: Optional[int] = 0
# For logits and logprobs post processing
temp_scaled_logprobs: bool = False
top_p_normalized_logprobs: bool = False
# remove max_tokens when field is removed from OpenAI API
max_tokens: Optional[int] = Field(
default=None,
@@ -591,6 +599,8 @@ class ChatCompletionRequest(BaseModel):
req_dict["max_tokens"] = self.max_completion_tokens or self.max_tokens
req_dict["logprobs"] = self.top_logprobs if self.logprobs else None
req_dict["temp_scaled_logprobs"] = self.temp_scaled_logprobs
req_dict["top_p_normalized_logprobs"] = self.top_p_normalized_logprobs
# parse request model into dict, priority: request params > metadata params
if self.metadata is not None:
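
As the hunk above shows, the ChatCompletionRequest flags are copied straight into the internal request dict alongside the top_logprobs-to-logprobs mapping. A hypothetical chat request enabling them (again with placeholder host and model name):

```python
import requests

payload = {
    "model": "my-model",                 # placeholder model name
    "messages": [{"role": "user", "content": "Hello!"}],
    "logprobs": True,
    "top_logprobs": 5,                   # becomes req_dict["logprobs"] = 5
    "temp_scaled_logprobs": True,
    "top_p_normalized_logprobs": True,
}
resp = requests.post("http://localhost:8000/v1/chat/completions", json=payload)
print(resp.json()["choices"][0]["logprobs"])
```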