From 31f639f10b81d9974418b943b4571d83431935d7 Mon Sep 17 00:00:00 2001 From: memoryCoderC <1137889088@qq.com> Date: Thu, 21 Aug 2025 10:23:27 +0800 Subject: [PATCH] [Feature] add prompt_tokens and completion_tokens (#3504) --- fastdeploy/entrypoints/openai/protocol.py | 8 ++++++++ fastdeploy/entrypoints/openai/serving_chat.py | 4 ++++ fastdeploy/entrypoints/openai/serving_completion.py | 4 ++++ 3 files changed, 16 insertions(+) diff --git a/fastdeploy/entrypoints/openai/protocol.py b/fastdeploy/entrypoints/openai/protocol.py index 508c27f06..ad9de2855 100644 --- a/fastdeploy/entrypoints/openai/protocol.py +++ b/fastdeploy/entrypoints/openai/protocol.py @@ -139,6 +139,8 @@ class ChatMessage(BaseModel): completion_token_ids: Optional[List[int]] = None text_after_process: Optional[str] = None raw_prediction: Optional[str] = None + prompt_tokens: Optional[str] = None + completion_tokens: Optional[str] = None class ChatCompletionResponseChoice(BaseModel): @@ -198,6 +200,8 @@ class DeltaMessage(BaseModel): tool_calls: Optional[List[DeltaToolCall | ToolCall]] = None text_after_process: Optional[str] = None raw_prediction: Optional[str] = None + prompt_tokens: Optional[str] = None + completion_tokens: Optional[str] = None class ChatCompletionResponseStreamChoice(BaseModel): @@ -236,6 +240,8 @@ class CompletionResponseChoice(BaseModel): completion_token_ids: Optional[List[int]] = None text_after_process: Optional[str] = None raw_prediction: Optional[str] = None + prompt_tokens: Optional[str] = None + completion_tokens: Optional[str] = None arrival_time: Optional[float] = None logprobs: Optional[CompletionLogprobs] = None reasoning_content: Optional[str] = None @@ -280,6 +286,8 @@ class CompletionResponseStreamChoice(BaseModel): completion_token_ids: Optional[List[int]] = None text_after_process: Optional[str] = None raw_prediction: Optional[str] = None + prompt_tokens: Optional[str] = None + completion_tokens: Optional[str] = None reasoning_content: Optional[str] = None finish_reason: Optional[Literal["stop", "length", "tool_calls"]] = None tool_calls: Optional[List[DeltaToolCall | ToolCall]] = None diff --git a/fastdeploy/entrypoints/openai/serving_chat.py b/fastdeploy/entrypoints/openai/serving_chat.py index ba277a387..5f9a99958 100644 --- a/fastdeploy/entrypoints/openai/serving_chat.py +++ b/fastdeploy/entrypoints/openai/serving_chat.py @@ -237,6 +237,7 @@ class OpenAIServingChat: if request.return_token_ids: choice.delta.prompt_token_ids = list(prompt_token_ids) choice.delta.text_after_process = text_after_process + choice.delta.prompt_tokens = text_after_process chunk = ChatCompletionStreamResponse( id=request_id, object=chunk_object_type, @@ -308,6 +309,7 @@ class OpenAIServingChat: if request.return_token_ids: choice.delta.completion_token_ids = list(output["token_ids"]) choice.delta.raw_prediction = output.get("raw_prediction") + choice.delta.completion_tokens = output.get("raw_prediction") if include_continuous_usage: chunk.usage = UsageInfo( prompt_tokens=num_prompt_tokens, @@ -442,7 +444,9 @@ class OpenAIServingChat: prompt_token_ids=prompt_token_ids if request.return_token_ids else None, completion_token_ids=completion_token_ids if request.return_token_ids else None, text_after_process=text_after_process if request.return_token_ids else None, + prompt_tokens=text_after_process if request.return_token_ids else None, raw_prediction=output.get("raw_prediction") if request.return_token_ids else None, + completion_tokens=output.get("raw_prediction") if request.return_token_ids else None, ) logprobs_full_res = None if logprob_contents: diff --git a/fastdeploy/entrypoints/openai/serving_completion.py b/fastdeploy/entrypoints/openai/serving_completion.py index fdcf106ba..c6ee86d2f 100644 --- a/fastdeploy/entrypoints/openai/serving_completion.py +++ b/fastdeploy/entrypoints/openai/serving_completion.py @@ -343,6 +343,7 @@ class OpenAIServingCompletion: text="", prompt_token_ids=list(prompt_batched_token_ids[idx]), text_after_process=text_after_process_list[idx], + prompt_tokens=text_after_process_list[idx], completion_token_ids=None, ) ], @@ -393,6 +394,7 @@ class OpenAIServingCompletion: completion_token_ids=output.get("token_ids") if request.return_token_ids else None, tool_calls=None, raw_prediction=output.get("raw_prediction") if request.return_token_ids else None, + completion_tokens=output.get("raw_prediction") if request.return_token_ids else None, reasoning_content=output.get("reasoning_content"), arrival_time=arrival_time, logprobs=logprobs_res, @@ -511,7 +513,9 @@ class OpenAIServingCompletion: prompt_token_ids=prompt_token_ids if request.return_token_ids else None, completion_token_ids=completion_token_ids if request.return_token_ids else None, raw_prediction=output.get("raw_prediction") if request.return_token_ids else None, + completion_tokens=output.get("raw_prediction") if request.return_token_ids else None, text_after_process=text_after_process_list[idx] if request.return_token_ids else None, + prompt_tokens=text_after_process_list[idx] if request.return_token_ids else None, reasoning_content=output.get("reasoning_content"), tool_calls=output.get("tool_call"), logprobs=aggregated_logprobs,