diff --git a/fastdeploy/entrypoints/openai/protocol.py b/fastdeploy/entrypoints/openai/protocol.py index 678ae8dd0..b419834a3 100644 --- a/fastdeploy/entrypoints/openai/protocol.py +++ b/fastdeploy/entrypoints/openai/protocol.py @@ -128,6 +128,8 @@ class ChatMessage(BaseModel): completion_token_ids: Optional[List[int]] = None text_after_process: Optional[str] = None raw_prediction: Optional[str] = None + prompt_tokens: Optional[str] = None + completion_tokens: Optional[str] = None class ChatCompletionResponseChoice(BaseModel): @@ -187,6 +189,8 @@ class DeltaMessage(BaseModel): tool_calls: Optional[List[DeltaToolCall | ToolCall]] = None text_after_process: Optional[str] = None raw_prediction: Optional[str] = None + prompt_tokens: Optional[str] = None + completion_tokens: Optional[str] = None class ChatCompletionResponseStreamChoice(BaseModel): @@ -225,6 +229,8 @@ class CompletionResponseChoice(BaseModel): completion_token_ids: Optional[List[int]] = None text_after_process: Optional[str] = None raw_prediction: Optional[str] = None + prompt_tokens: Optional[str] = None + completion_tokens: Optional[str] = None arrival_time: Optional[float] = None logprobs: Optional[CompletionLogprobs] = None reasoning_content: Optional[str] = None @@ -269,6 +275,8 @@ class CompletionResponseStreamChoice(BaseModel): completion_token_ids: Optional[List[int]] = None text_after_process: Optional[str] = None raw_prediction: Optional[str] = None + prompt_tokens: Optional[str] = None + completion_tokens: Optional[str] = None reasoning_content: Optional[str] = None finish_reason: Optional[Literal["stop", "length", "tool_calls"]] = None tool_calls: Optional[List[DeltaToolCall | ToolCall]] = None diff --git a/fastdeploy/entrypoints/openai/serving_chat.py b/fastdeploy/entrypoints/openai/serving_chat.py index 536cd7d80..32e94d6d5 100644 --- a/fastdeploy/entrypoints/openai/serving_chat.py +++ b/fastdeploy/entrypoints/openai/serving_chat.py @@ -224,6 +224,7 @@ class OpenAIServingChat: if request.return_token_ids: choice.delta.prompt_token_ids = list(prompt_token_ids) choice.delta.text_after_process = text_after_process + choice.delta.prompt_tokens = text_after_process chunk = ChatCompletionStreamResponse( id=request_id, object=chunk_object_type, @@ -290,6 +291,7 @@ class OpenAIServingChat: if request.return_token_ids: choice.delta.completion_token_ids = list(output["token_ids"]) choice.delta.raw_prediction = output.get("raw_prediction") + choice.delta.completion_tokens = output.get("raw_prediction") if include_continuous_usage: chunk.usage = UsageInfo( prompt_tokens=num_prompt_tokens, @@ -423,7 +425,9 @@ class OpenAIServingChat: prompt_token_ids=prompt_token_ids if request.return_token_ids else None, completion_token_ids=completion_token_ids if request.return_token_ids else None, text_after_process=text_after_process if request.return_token_ids else None, + prompt_tokens=text_after_process if request.return_token_ids else None, raw_prediction=output.get("raw_prediction") if request.return_token_ids else None, + completion_tokens=output.get("raw_prediction") if request.return_token_ids else None, ) logprobs_full_res = None if logprob_contents: diff --git a/fastdeploy/entrypoints/openai/serving_completion.py b/fastdeploy/entrypoints/openai/serving_completion.py index cec597f78..119dbbdf4 100644 --- a/fastdeploy/entrypoints/openai/serving_completion.py +++ b/fastdeploy/entrypoints/openai/serving_completion.py @@ -317,6 +317,7 @@ class OpenAIServingCompletion: text="", prompt_token_ids=list(prompt_batched_token_ids[idx]), text_after_process=text_after_process_list[idx], + prompt_tokens=text_after_process_list[idx], completion_token_ids=None, ) ], @@ -349,6 +350,7 @@ class OpenAIServingCompletion: prompt_token_ids=None, completion_token_ids=output.get("token_ids") if request.return_token_ids else None, raw_prediction=output.get("raw_prediction") if request.return_token_ids else None, + completion_tokens=output.get("raw_prediction") if request.return_token_ids else None, tool_calls=output.get("tool_call_content"), reasoning_content=output.get("reasoning_content"), arrival_time=arrival_time, @@ -467,7 +469,9 @@ class OpenAIServingCompletion: prompt_token_ids=prompt_token_ids if request.return_token_ids else None, completion_token_ids=completion_token_ids if request.return_token_ids else None, raw_prediction=output.get("raw_prediction") if request.return_token_ids else None, + completion_tokens=output.get("raw_prediction") if request.return_token_ids else None, text_after_process=text_after_process_list[idx] if request.return_token_ids else None, + prompt_tokens=text_after_process_list[idx] if request.return_token_ids else None, reasoning_content=output.get("reasoning_content"), tool_calls=output.get("tool_call_content"), logprobs=aggregated_logprobs,