diff --git a/docs/get_started/installation/intel_gaudi.md b/docs/get_started/installation/intel_gaudi.md
index 93c5504fd..5e4e9edc1 100644
--- a/docs/get_started/installation/intel_gaudi.md
+++ b/docs/get_started/installation/intel_gaudi.md
@@ -71,5 +71,5 @@ curl -X POST "http://0.0.0.0:8188/v1/chat/completions" \
 
 ### 3. Successfully returns the result
 ```json
-{"id":"chatcmpl-3bd98ae2-fafe-46ae-a552-d653a8526503","object":"chat.completion","created":1757653575,"model":"ERNIE-4.5-21B-A3B-Paddle","choices":[{"index":0,"message":{"role":"assistant","content":"**AI (Artificial Intelligence)** refers to the development of computer systems that can perform tasks typically requiring human intelligence.","multimodal_content":null,"reasoning_content":null,"tool_calls":null,"prompt_token_ids":null,"completion_token_ids":null,"text_after_process":null,"raw_prediction":null,"prompt_tokens":null,"completion_tokens":null},"logprobs":null,"finish_reason":"length"}],"usage":{"prompt_tokens":11,"total_tokens":35,"completion_tokens":24,"prompt_tokens_details":{"cached_tokens":0}}}
+{"id":"chatcmpl-3bd98ae2-fafe-46ae-a552-d653a8526503","object":"chat.completion","created":1757653575,"model":"ERNIE-4.5-21B-A3B-Paddle","choices":[{"index":0,"message":{"role":"assistant","content":"**AI (Artificial Intelligence)** refers to the development of computer systems that can perform tasks typically requiring human intelligence.","multimodal_content":null,"reasoning_content":null,"tool_calls":null,"prompt_token_ids":null,"completion_token_ids":null,"prompt_tokens":null,"completion_tokens":null},"logprobs":null,"finish_reason":"length"}],"usage":{"prompt_tokens":11,"total_tokens":35,"completion_tokens":24,"prompt_tokens_details":{"cached_tokens":0}}}
 ```
diff --git a/docs/online_serving/README.md b/docs/online_serving/README.md
index c00b62e31..587919095 100644
--- a/docs/online_serving/README.md
+++ b/docs/online_serving/README.md
@@ -231,8 +231,18 @@ ChatMessage:
     role: str
     content: str
     reasoning_content: Optional[str] = None
+    tool_calls: Optional[List[DeltaToolCall | ToolCall]] = None
     prompt_token_ids: Optional[List[int]] = None
     completion_token_ids: Optional[List[int]] = None
+    prompt_tokens: Optional[str] = None
+    completion_tokens: Optional[str] = None
+ToolCall:
+    id: str = None
+    type: Literal["function"] = "function"
+    function: FunctionCall
+FunctionCall:
+    name: str
+    arguments: str
 
 # Fields returned for streaming responses
 ChatCompletionStreamResponse:
@@ -254,6 +264,17 @@ DeltaMessage:
     prompt_token_ids: Optional[List[int]] = None
     completion_token_ids: Optional[List[int]] = None
     reasoning_content: Optional[str] = None
+    tool_calls: Optional[List[DeltaToolCall | ToolCall]] = None
+    prompt_tokens: Optional[str] = None
+    completion_tokens: Optional[str] = None
+DeltaToolCall:
+    id: Optional[str] = None
+    type: Optional[Literal["function"]] = None
+    index: int
+    function: Optional[DeltaFunctionCall] = None
+DeltaFunctionCall:
+    name: Optional[str] = None
+    arguments: Optional[str] = None
 ```
 
 ## Completion API
@@ -384,10 +405,20 @@ CompletionResponseChoice:
     text: str
     prompt_token_ids: Optional[List[int]] = None
     completion_token_ids: Optional[List[int]] = None
+    prompt_tokens: Optional[str] = None
+    completion_tokens: Optional[str] = None
     arrival_time: Optional[float] = None
     logprobs: Optional[int] = None
     reasoning_content: Optional[str] = None
     finish_reason: Optional[Literal["stop", "length", "tool_calls"]]
+    tool_calls: Optional[List[DeltaToolCall | ToolCall]] = None
+ToolCall:
+    id: str = None
+    type: Literal["function"] = "function"
+    function: FunctionCall
+FunctionCall:
+    name: str
+    arguments: str
 
 # Fields returned for streaming responses
 CompletionStreamResponse:
@@ -403,8 +434,18 @@ CompletionResponseStreamChoice:
     arrival_time: float = None
     prompt_token_ids: Optional[List[int]] = None
     completion_token_ids: Optional[List[int]] = None
+    prompt_tokens: Optional[str] = None
+    completion_tokens: Optional[str] = None
     logprobs: Optional[float] = None
     reasoning_content: Optional[str] = None
     finish_reason: Optional[Literal["stop", "length", "tool_calls"]] = None
-
+    tool_calls: Optional[List[DeltaToolCall | ToolCall]] = None
+DeltaToolCall:
+    id: Optional[str] = None
+    type: Optional[Literal["function"]] = None
+    index: int
+    function: Optional[DeltaFunctionCall] = None
+DeltaFunctionCall:
+    name: Optional[str] = None
+    arguments: Optional[str] = None
 ```
diff --git a/docs/zh/get_started/installation/intel_gaudi.md b/docs/zh/get_started/installation/intel_gaudi.md
index e8b46aa0b..5f0577b3b 100644
--- a/docs/zh/get_started/installation/intel_gaudi.md
+++ b/docs/zh/get_started/installation/intel_gaudi.md
@@ -71,5 +71,5 @@ curl -X POST "http://0.0.0.0:8188/v1/chat/completions" \
 
 ### 3. 成功返回结果
 ```json
-{"id":"chatcmpl-3bd98ae2-fafe-46ae-a552-d653a8526503","object":"chat.completion","created":1757653575,"model":"ERNIE-4.5-21B-A3B-Paddle","choices":[{"index":0,"message":{"role":"assistant","content":"**AI (Artificial Intelligence)** refers to the development of computer systems that can perform tasks typically requiring human intelligence.","multimodal_content":null,"reasoning_content":null,"tool_calls":null,"prompt_token_ids":null,"completion_token_ids":null,"text_after_process":null,"raw_prediction":null,"prompt_tokens":null,"completion_tokens":null},"logprobs":null,"finish_reason":"length"}],"usage":{"prompt_tokens":11,"total_tokens":35,"completion_tokens":24,"prompt_tokens_details":{"cached_tokens":0}}}
+{"id":"chatcmpl-3bd98ae2-fafe-46ae-a552-d653a8526503","object":"chat.completion","created":1757653575,"model":"ERNIE-4.5-21B-A3B-Paddle","choices":[{"index":0,"message":{"role":"assistant","content":"**AI (Artificial Intelligence)** refers to the development of computer systems that can perform tasks typically requiring human intelligence.","multimodal_content":null,"reasoning_content":null,"tool_calls":null,"prompt_token_ids":null,"completion_token_ids":null,"prompt_tokens":null,"completion_tokens":null},"logprobs":null,"finish_reason":"length"}],"usage":{"prompt_tokens":11,"total_tokens":35,"completion_tokens":24,"prompt_tokens_details":{"cached_tokens":0}}}
 ```
diff --git a/docs/zh/online_serving/README.md b/docs/zh/online_serving/README.md
index 250c24b37..35f5e1349 100644
--- a/docs/zh/online_serving/README.md
+++ b/docs/zh/online_serving/README.md
@@ -230,8 +230,18 @@ ChatMessage:
     role: str
     content: str
     reasoning_content: Optional[str] = None
+    tool_calls: Optional[List[DeltaToolCall | ToolCall]] = None
     prompt_token_ids: Optional[List[int]] = None
     completion_token_ids: Optional[List[int]] = None
+    prompt_tokens: Optional[str] = None
+    completion_tokens: Optional[str] = None
+ToolCall:
+    id: str = None
+    type: Literal["function"] = "function"
+    function: FunctionCall
+FunctionCall:
+    name: str
+    arguments: str
 
 # 返回流式响应的字段
 ChatCompletionStreamResponse:
@@ -253,6 +263,17 @@ DeltaMessage:
     prompt_token_ids: Optional[List[int]] = None
     completion_token_ids: Optional[List[int]] = None
     reasoning_content: Optional[str] = None
+    tool_calls: Optional[List[DeltaToolCall | ToolCall]] = None
+    prompt_tokens: Optional[str] = None
+    completion_tokens: Optional[str] = None
+DeltaToolCall:
+    id: Optional[str] = None
+    type: Optional[Literal["function"]] = None
+    index: int
+    function: Optional[DeltaFunctionCall] = None
+DeltaFunctionCall:
+    name: Optional[str] = None
+    arguments: Optional[str] = None
 ```
 
 ## Completion API
@@ -380,10 +401,20 @@ CompletionResponseChoice:
     text: str
     prompt_token_ids: Optional[List[int]] = None
     completion_token_ids: Optional[List[int]] = None
+    prompt_tokens: Optional[str] = None
+    completion_tokens: Optional[str] = None
     arrival_time: Optional[float] = None
     logprobs: Optional[int] = None
     reasoning_content: Optional[str] = None
     finish_reason: Optional[Literal["stop", "length", "tool_calls"]]
+    tool_calls: Optional[List[DeltaToolCall | ToolCall]] = None
+ToolCall:
+    id: str = None
+    type: Literal["function"] = "function"
+    function: FunctionCall
+FunctionCall:
+    name: str
+    arguments: str
 
 # 返回流式响应的字段
 CompletionStreamResponse:
@@ -399,8 +430,18 @@ CompletionResponseStreamChoice:
     arrival_time: float = None
     prompt_token_ids: Optional[List[int]] = None
     completion_token_ids: Optional[List[int]] = None
+    prompt_tokens: Optional[str] = None
+    completion_tokens: Optional[str] = None
     logprobs: Optional[float] = None
     reasoning_content: Optional[str] = None
     finish_reason: Optional[Literal["stop", "length", "tool_calls"]] = None
-
+    tool_calls: Optional[List[DeltaToolCall | ToolCall]] = None
+DeltaToolCall:
+    id: Optional[str] = None
+    type: Optional[Literal["function"]] = None
+    index: int
+    function: Optional[DeltaFunctionCall] = None
+DeltaFunctionCall:
+    name: Optional[str] = None
+    arguments: Optional[str] = None
 ```
diff --git a/fastdeploy/entrypoints/openai/protocol.py b/fastdeploy/entrypoints/openai/protocol.py
index c7f87d3f1..d6c0c74bb 100644
--- a/fastdeploy/entrypoints/openai/protocol.py
+++ b/fastdeploy/entrypoints/openai/protocol.py
@@ -193,8 +193,6 @@ class ChatMessage(BaseModel):
     tool_calls: Optional[List[DeltaToolCall | ToolCall]] = None
     prompt_token_ids: Optional[List[int]] = None
     completion_token_ids: Optional[List[int]] = None
-    text_after_process: Optional[str] = None
-    raw_prediction: Optional[str] = None
     prompt_tokens: Optional[str] = None
     completion_tokens: Optional[str] = None
 
@@ -255,8 +253,6 @@ class DeltaMessage(BaseModel):
     completion_token_ids: Optional[List[int]] = None
     reasoning_content: Optional[str] = None
     tool_calls: Optional[List[DeltaToolCall | ToolCall]] = None
-    text_after_process: Optional[str] = None
-    raw_prediction: Optional[str] = None
     prompt_tokens: Optional[str] = None
     completion_tokens: Optional[str] = None
 
@@ -295,8 +291,6 @@ class CompletionResponseChoice(BaseModel):
     text: str
     prompt_token_ids: Optional[List[int]] = None
    completion_token_ids: Optional[List[int]] = None
-    text_after_process: Optional[str] = None
-    raw_prediction: Optional[str] = None
     prompt_tokens: Optional[str] = None
     completion_tokens: Optional[str] = None
     arrival_time: Optional[float] = None
@@ -341,8 +335,6 @@ class CompletionResponseStreamChoice(BaseModel):
     logprobs: Optional[CompletionLogprobs] = None
     prompt_token_ids: Optional[List[int]] = None
     completion_token_ids: Optional[List[int]] = None
-    text_after_process: Optional[str] = None
-    raw_prediction: Optional[str] = None
     prompt_tokens: Optional[str] = None
     completion_tokens: Optional[str] = None
     reasoning_content: Optional[str] = None
diff --git a/fastdeploy/entrypoints/openai/serving_chat.py b/fastdeploy/entrypoints/openai/serving_chat.py
index 2a11b04ed..be647bf86 100644
--- a/fastdeploy/entrypoints/openai/serving_chat.py
+++ b/fastdeploy/entrypoints/openai/serving_chat.py
@@ -118,14 +118,14 @@ class OpenAIServingChat:
         else:
             request_id = f"chatcmpl-{uuid.uuid4()}"
         api_server_logger.info(f"create chat completion request: {request_id}")
-        text_after_process = None
+        prompt_tokens = None
         try:
             current_req_dict = request.to_dict_for_infer(request_id)
             if "chat_template" not in current_req_dict:
                 current_req_dict["chat_template"] = self.chat_template
             current_req_dict["arrival_time"] = time.time()
             prompt_token_ids = await self.engine_client.format_and_add_data(current_req_dict)
-            text_after_process = current_req_dict.get("text_after_process")
+            prompt_tokens = current_req_dict.get("prompt_tokens")
             if isinstance(prompt_token_ids, np.ndarray):
                 prompt_token_ids = prompt_token_ids.tolist()
         except ParameterError as e:
@@ -143,12 +143,12 @@
 
         if request.stream:
             return self.chat_completion_stream_generator(
-                request, request_id, request.model, prompt_token_ids, text_after_process
+                request, request_id, request.model, prompt_token_ids, prompt_tokens
             )
         else:
             try:
                 return await self.chat_completion_full_generator(
-                    request, request_id, request.model, prompt_token_ids, text_after_process
+                    request, request_id, request.model, prompt_token_ids, prompt_tokens
                 )
             except Exception as e:
                 error_msg = f"request[{request_id}]full generator error: {str(e)}, {str(traceback.format_exc())}"
@@ -175,7 +175,7 @@
         request_id: str,
         model_name: str,
        prompt_token_ids: list(),
-        text_after_process: str,
+        prompt_tokens: str,
     ):
         """
         Streaming chat completion generator.
@@ -289,8 +289,7 @@
 
                    if request.return_token_ids:
                        choice.delta.prompt_token_ids = list(prompt_token_ids)
-                       choice.delta.text_after_process = text_after_process
-                       choice.delta.prompt_tokens = text_after_process
+                       choice.delta.prompt_tokens = prompt_tokens
                    chunk = ChatCompletionStreamResponse(
                        id=request_id,
                        object=chunk_object_type,
@@ -368,8 +367,7 @@
                                choice.delta.multimodal_content[0]["completion_token_ids"] = list(output["token_ids"])
                            else:
                                choice.delta.completion_token_ids = list(output["token_ids"])
-                           choice.delta.raw_prediction = output.get("raw_prediction")
-                           choice.delta.completion_tokens = output.get("raw_prediction")
+                           choice.delta.completion_tokens = output.get("completion_tokens")
                        if include_continuous_usage:
                            chunk.usage = UsageInfo(
                                prompt_tokens=num_prompt_tokens,
@@ -419,7 +417,7 @@
         request_id: str,
         model_name: str,
         prompt_token_ids: list(),
-        text_after_process: str,
+        prompt_tokens: str,
     ):
         """
         Full chat completion generator.
@@ -509,10 +507,8 @@ class OpenAIServingChat:
                 tool_calls=output.get("tool_call"),
                 prompt_token_ids=prompt_token_ids if request.return_token_ids else None,
                 completion_token_ids=completion_token_ids if request.return_token_ids else None,
-                text_after_process=text_after_process if request.return_token_ids else None,
-                prompt_tokens=text_after_process if request.return_token_ids else None,
-                raw_prediction=output.get("raw_prediction") if request.return_token_ids else None,
-                completion_tokens=output.get("raw_prediction") if request.return_token_ids else None,
+                prompt_tokens=prompt_tokens if request.return_token_ids else None,
+                completion_tokens=output.get("completion_tokens") if request.return_token_ids else None,
             )
 
             if response_processor.enable_multimodal_content():
diff --git a/fastdeploy/entrypoints/openai/serving_completion.py b/fastdeploy/entrypoints/openai/serving_completion.py
index c28907844..03eb64825 100644
--- a/fastdeploy/entrypoints/openai/serving_completion.py
+++ b/fastdeploy/entrypoints/openai/serving_completion.py
@@ -132,7 +132,7 @@ class OpenAIServingCompletion:
         num_choices = len(request_prompts)
         api_server_logger.info(f"Start preprocessing request: req_id={request_id}), num_choices={num_choices}")
         prompt_batched_token_ids = []
-        text_after_process_list = []
+        prompt_tokens_list = []
         try:
             if self.max_waiting_time < 0:
                 await self.engine_client.semaphore.acquire()
@@ -157,7 +157,7 @@
                 prompt_token_ids = await self.engine_client.format_and_add_data(current_req_dict)  # tokenize
                 if isinstance(prompt_token_ids, np.ndarray):
                     prompt_token_ids = prompt_token_ids.tolist()
-                text_after_process_list.append(current_req_dict.get("text_after_process"))
+                prompt_tokens_list.append(current_req_dict.get("prompt_tokens"))
                 prompt_batched_token_ids.append(prompt_token_ids)
                 del current_req_dict
             except ParameterError as e:
@@ -180,7 +180,7 @@
                     created_time=created_time,
                     model_name=request.model,
                     prompt_batched_token_ids=prompt_batched_token_ids,
-                    text_after_process_list=text_after_process_list,
+                    prompt_tokens_list=prompt_tokens_list,
                 )
             else:
                 try:
@@ -191,7 +191,7 @@
                         created_time=created_time,
                         model_name=request.model,
                         prompt_batched_token_ids=prompt_batched_token_ids,
-                        text_after_process_list=text_after_process_list,
+                        prompt_tokens_list=prompt_tokens_list,
                     )
                 except Exception as e:
                     error_msg = (
@@ -213,7 +213,7 @@
         created_time: int,
         model_name: str,
         prompt_batched_token_ids: list(),
-        text_after_process_list: list(),
+        prompt_tokens_list: list(),
     ):
         """
         Process the full completion request with multiple choices.
@@ -292,7 +292,7 @@
                 model_name=model_name,
                 prompt_batched_token_ids=prompt_batched_token_ids,
                 completion_batched_token_ids=completion_batched_token_ids,
-                text_after_process_list=text_after_process_list,
+                prompt_tokens_list=prompt_tokens_list,
             )
             api_server_logger.info(f"Completion response: {res.model_dump_json()}")
             return res
@@ -344,7 +344,7 @@
         created_time: int,
         model_name: str,
         prompt_batched_token_ids: list(),
-        text_after_process_list: list(),
+        prompt_tokens_list: list(),
     ):
         """
         Process the stream completion request.
@@ -408,8 +408,7 @@ class OpenAIServingCompletion:
                             index=idx,
                             text="",
                             prompt_token_ids=list(prompt_batched_token_ids[idx]),
-                            text_after_process=text_after_process_list[idx],
-                            prompt_tokens=text_after_process_list[idx],
+                            prompt_tokens=prompt_tokens_list[idx],
                             completion_token_ids=None,
                         )
                     ],
@@ -443,8 +442,7 @@
                        prompt_token_ids=None,
                        completion_token_ids=output.get("token_ids") if request.return_token_ids else None,
                        tool_calls=None,
-                       raw_prediction=output.get("raw_prediction") if request.return_token_ids else None,
-                       completion_tokens=output.get("raw_prediction") if request.return_token_ids else None,
+                       completion_tokens=output.get("completion_tokens") if request.return_token_ids else None,
                        reasoning_content="",
                        arrival_time=arrival_time,
                        logprobs=logprobs_res,
@@ -522,7 +520,7 @@
         model_name: str,
         prompt_batched_token_ids: list(),
         completion_batched_token_ids: list(),
-        text_after_process_list: list(),
+        prompt_tokens_list: list(),
     ) -> CompletionResponse:
         choices: List[CompletionResponseChoice] = []
         num_prompt_tokens = 0
@@ -556,10 +554,8 @@
                 text=output_text,
                 prompt_token_ids=prompt_token_ids if request.return_token_ids else None,
                 completion_token_ids=completion_token_ids if request.return_token_ids else None,
-                raw_prediction=output.get("raw_prediction") if request.return_token_ids else None,
-                completion_tokens=output.get("raw_prediction") if request.return_token_ids else None,
-                text_after_process=text_after_process_list[idx] if request.return_token_ids else None,
-                prompt_tokens=text_after_process_list[idx] if request.return_token_ids else None,
+                completion_tokens=output.get("completion_tokens") if request.return_token_ids else None,
+                prompt_tokens=prompt_tokens_list[idx] if request.return_token_ids else None,
                 reasoning_content=output.get("reasoning_content"),
                 tool_calls=output.get("tool_call"),
                 logprobs=aggregated_logprobs,
diff --git a/fastdeploy/input/ernie4_5_processor.py b/fastdeploy/input/ernie4_5_processor.py
index 8d2463a08..18edebf1a 100644
--- a/fastdeploy/input/ernie4_5_processor.py
+++ b/fastdeploy/input/ernie4_5_processor.py
@@ -197,7 +197,7 @@ class Ernie4_5Processor(BaseDataProcessor):
            if isinstance(prompt, list):  # if prompt is a token id list
                request["prompt_token_ids"] = prompt
            else:
-               request["text_after_process"] = prompt
+               request["prompt_tokens"] = prompt
                tokens = self.tokenizer.tokenize(prompt)
                token_ids = self.tokenizer.convert_tokens_to_ids(tokens)
                request["prompt_token_ids"] = token_ids
@@ -318,7 +318,7 @@
            if tool_call_info.tools_called:
                response_dict["outputs"]["tool_call"] = tool_call_info.tool_calls
                response_dict["outputs"]["text"] = tool_call_info.content
-        response_dict["outputs"]["raw_prediction"] = full_text
+        response_dict["outputs"]["completion_tokens"] = full_text
         data_processor_logger.info(f"req_id:{req_id}, decode_status: {self.decode_status[req_id]}")
         del self.decode_status[req_id]
         return response_dict
@@ -342,7 +342,7 @@
         if token_ids[-1] == self.tokenizer.eos_token_id:
             token_ids = token_ids[:-1]
         delta_text, previous_token_ids, previous_texts = self.ids2tokens(token_ids, req_id)
-        response_dict["outputs"]["raw_prediction"] = delta_text
+        response_dict["outputs"]["completion_tokens"] = delta_text
         if self.reasoning_parser and (
             enable_thinking or self.reasoning_parser.__class__.__name__ == "ErnieX1ReasoningParser"
         ):
@@ -398,7 +398,7 @@ class Ernie4_5Processor(BaseDataProcessor):
            add_special_tokens=False,
            **kwargs,
        )
-        request_or_messages["text_after_process"] = spliced_message
+        request_or_messages["prompt_tokens"] = spliced_message
         req_id = None
         if isinstance(request_or_messages, dict):
             req_id = request_or_messages.get("request_id", None)
diff --git a/fastdeploy/input/ernie4_5_vl_processor/ernie4_5_vl_processor.py b/fastdeploy/input/ernie4_5_vl_processor/ernie4_5_vl_processor.py
index 9251dd9d9..439b752c2 100644
--- a/fastdeploy/input/ernie4_5_vl_processor/ernie4_5_vl_processor.py
+++ b/fastdeploy/input/ernie4_5_vl_processor/ernie4_5_vl_processor.py
@@ -222,7 +222,7 @@ class Ernie4_5_VLProcessor(Ernie4_5Processor):
            self._check_mm_limits(multimodal_data)
            images = multimodal_data.get("image", None)
            videos = multimodal_data.get("video", None)
-           request["text_after_process"] = request.get("prompt")
+           request["prompt_tokens"] = request.get("prompt")
            outputs = self.ernie4_5_processor.text2ids(request["prompt"], images, videos)
        elif request.get("messages"):
            messages = request["messages"]
diff --git a/fastdeploy/input/ernie4_5_vl_processor/process.py b/fastdeploy/input/ernie4_5_vl_processor/process.py
index ea22850dd..c3671c943 100644
--- a/fastdeploy/input/ernie4_5_vl_processor/process.py
+++ b/fastdeploy/input/ernie4_5_vl_processor/process.py
@@ -503,7 +503,7 @@ class DataProcessor:
         prompt_token_str = prompt_token_template.replace("<|image@placeholder|>", "").replace(
             "<|video@placeholder|>", ""
         )
-        request["text_after_process"] = prompt_token_template
+        request["prompt_tokens"] = prompt_token_template
         tokens = self.tokenizer.tokenize(prompt_token_str)
         token_ids = self.tokenizer.convert_tokens_to_ids(tokens)
         data_processor_logger.info(
diff --git a/fastdeploy/input/qwen_vl_processor/process.py b/fastdeploy/input/qwen_vl_processor/process.py
index 9b480737a..53a9381ae 100644
--- a/fastdeploy/input/qwen_vl_processor/process.py
+++ b/fastdeploy/input/qwen_vl_processor/process.py
@@ -495,7 +495,7 @@ class DataProcessor:
            add_generation_prompt=request.get("add_generation_prompt", True),
        )
        prompt_token_str = raw_prompt.replace(self.image_token, "").replace(self.video_token, "")
-       request["text_after_process"] = raw_prompt
+       request["prompt_tokens"] = raw_prompt
        tokens = self.tokenizer.tokenize(prompt_token_str)
        token_ids = self.tokenizer.convert_tokens_to_ids(tokens)
 
diff --git a/fastdeploy/input/text_processor.py b/fastdeploy/input/text_processor.py
index a29e1b260..47891b51e 100644
--- a/fastdeploy/input/text_processor.py
+++ b/fastdeploy/input/text_processor.py
@@ -403,7 +403,7 @@ class DataProcessor(BaseDataProcessor):
         delta_text, _, previous_texts = self.ids2tokens(token_ids, req_id)
         if is_end:
             full_text = previous_texts + delta_text
-            response_dict["outputs"]["raw_prediction"] = full_text
+            response_dict["outputs"]["completion_tokens"] = full_text
             if enable_thinking and self.reasoning_parser:
                 reasoning_content, text = self.reasoning_parser.extract_reasoning_content(full_text, response_dict)
                 response_dict["outputs"]["text"] = text
@@ -439,7 +439,7 @@
         if token_ids[-1] in self.eos_token_ids:
             token_ids = token_ids[:-1]
         delta_text, previous_token_ids, previous_texts = self.ids2tokens(token_ids, req_id)
-        response_dict["outputs"]["raw_prediction"] = delta_text
+        response_dict["outputs"]["completion_tokens"] = delta_text
         if self.reasoning_parser and (
             enable_thinking or self.reasoning_parser.__class__.__name__ == "ErnieX1ReasoningParser"
         ):
@@ -548,7 +548,7 @@ class DataProcessor(BaseDataProcessor):
            return_tensors="pd",
            **kwargs,
        )
-        request["text_after_process"] = spliced_message
+        request["prompt_tokens"] = spliced_message
         req_id = None
         tokens = self.tokenizer.tokenize(spliced_message)
         if isinstance(request, dict):
diff --git a/tests/ce/server/test_return_token_ids.py b/tests/ce/server/test_return_token_ids.py
index ccaf496af..c6d17c62f 100644
--- a/tests/ce/server/test_return_token_ids.py
+++ b/tests/ce/server/test_return_token_ids.py
@@ -14,10 +14,10 @@ from core import TEMPLATE, URL, build_request_payload, send_request
 COMPLETIONS_URL = URL.replace("/v1/chat/completions", "/v1/completions")
 
 
-def test_completion_stream_text_after_process_raw_prediction():
+def test_completion_stream_prompt_tokens_completion_tokens():
     """
     /v1/completions接口, stream=True
-    返回属性"text_after_process"和"reasoning_content"
+    返回属性"prompt_tokens"和"completion_tokens"
     """
     data = {
         "prompt": "你是谁",
@@ -39,55 +39,55 @@
 
         choice = response_data["choices"][0]
         if "prompt_token_ids" in choice and choice["prompt_token_ids"] is not None:
-            text_after_process = choice["text_after_process"]
-            assert data["prompt"] in text_after_process, "text_after_process取值结果不正确"
+            prompt_tokens = choice["prompt_tokens"]
+            assert data["prompt"] in prompt_tokens, "prompt_tokens取值结果不正确"
         else:
-            raw_prediction = choice["raw_prediction"]
+            completion_tokens = choice["completion_tokens"]
             reasoning_content = choice["reasoning_content"]
             text = choice["text"]
-            assert reasoning_content or text in raw_prediction, "raw_prediction取值结果不正确"
+            assert reasoning_content or text in completion_tokens, "completion_tokens取值结果不正确"
         if "finish_reason" in line.strip():
             break
 
 
-def test_completion_text_after_process_raw_predictio_return_token_ids():
+def test_completion_prompt_tokens_completion_tokens_return_token_ids():
     """
     /v1/completions接口,非流式接口
-    返回属性"text_after_process"和"reasoning_content"
+    返回属性"prompt_tokens"和"completion_tokens"
     """
     data = {"stream": False, "prompt": "你是谁", "max_tokens": 50, "return_token_ids": True}
     payload = build_request_payload(TEMPLATE, data)
     resp = send_request(COMPLETIONS_URL, payload).json()
 
-    text_after_process = resp["choices"][0]["text_after_process"]
-    assert data["prompt"] in text_after_process, "text_after_process取值结果不正确"
+    prompt_tokens = resp["choices"][0]["prompt_tokens"]
+    assert data["prompt"] in prompt_tokens, "prompt_tokens取值结果不正确"
 
-    raw_prediction = resp["choices"][0]["raw_prediction"]
+    completion_tokens = resp["choices"][0]["completion_tokens"]
     reasoning_content = resp["choices"][0]["reasoning_content"]
     text = resp["choices"][0]["text"]
-    assert reasoning_content or text in raw_prediction, "raw_prediction取值结果不正确"
+    assert reasoning_content or text in completion_tokens, "completion_tokens取值结果不正确"
 
 
-def test_completion_text_after_process_raw_prediction():
+def test_completion_prompt_tokens_completion_tokens():
     """
     /v1/completions接口,无return_token_ids参数
-    非流式接口中,无return token ids 属性"text_after_process"和"reasoning_content"值为null
+    非流式接口中,无return token ids 属性"prompt_tokens"和"completion_tokens"值为null
     """
     data = {"stream": False, "prompt": "你是谁", "max_tokens": 50}
     payload = build_request_payload(TEMPLATE, data)
     resp = send_request(COMPLETIONS_URL, payload).json()
 
-    text_after_process = resp["choices"][0]["text_after_process"]
-    assert text_after_process is None, "text_after_process取值结果不正确"
+    prompt_tokens = resp["choices"][0]["prompt_tokens"]
+    assert prompt_tokens is None, "prompt_tokens取值结果不正确"
 
-    raw_prediction = resp["choices"][0]["raw_prediction"]
-    assert raw_prediction is None, "raw_prediction取值结果不正确"
+    completion_tokens = resp["choices"][0]["completion_tokens"]
+    assert completion_tokens is None, "completion_tokens取值结果不正确"
 
 
-def test_stream_text_after_process_raw_prediction():
+def test_stream_prompt_tokens_completion_tokens():
     """
     /v1/chat/completions接口,"stream": True
-    返回属性"text_after_process"和"reasoning_content"
+    返回属性"prompt_tokens"和"completion_tokens"
     """
     data = {
         "messages": [{"role": "user", "content": "你是谁"}],
@@ -109,21 +109,21 @@
 
         choice = response_data["choices"][0]
         if "prompt_token_ids" in choice["delta"] and choice["delta"]["prompt_token_ids"] is not None:
-            text_after_process = choice["delta"]["text_after_process"]
-            assert data["messages"][0]["content"] in text_after_process, "text_after_process取值结果不正确"
+            prompt_tokens = choice["delta"]["prompt_tokens"]
+            assert data["messages"][0]["content"] in prompt_tokens, "prompt_tokens取值结果不正确"
         else:
-            raw_prediction = choice["delta"]["raw_prediction"]
+            completion_tokens = choice["delta"]["completion_tokens"]
             reasoning_content = choice["delta"]["reasoning_content"]
             content = choice["delta"]["content"]
-            assert reasoning_content or content in raw_prediction, "raw_prediction取值结果不正确"
+            assert reasoning_content or content in completion_tokens, "completion_tokens取值结果不正确"
         if "finish_reason" in line.strip():
             break
 
 
-def test_text_after_process_raw_prediction_return_token_ids():
+def test_prompt_tokens_completion_tokens_return_token_ids():
     """
     /v1/chat/completions接口,非流式接口
-    返回属性"text_after_process"和"reasoning_content"
+    返回属性"prompt_tokens"和"completion_tokens"
     """
     data = {
         "stream": False,
@@ -136,19 +136,19 @@
     payload = build_request_payload(TEMPLATE, data)
     resp = send_request(URL, payload).json()
 
-    text_after_process = resp["choices"][0]["message"]["text_after_process"]
-    assert data["messages"][0]["content"] in text_after_process, "text_after_process取值结果不正确"
+    prompt_tokens = resp["choices"][0]["message"]["prompt_tokens"]
+    assert data["messages"][0]["content"] in prompt_tokens, "prompt_tokens取值结果不正确"
 
-    raw_prediction = resp["choices"][0]["message"]["raw_prediction"]
+    completion_tokens = resp["choices"][0]["message"]["completion_tokens"]
     reasoning_content = resp["choices"][0]["message"]["reasoning_content"]
     text = resp["choices"][0]["message"]["content"]
-    assert reasoning_content or text in raw_prediction, "raw_prediction取值结果不正确"
+    assert reasoning_content or text in completion_tokens, "completion_tokens取值结果不正确"
 
 
-def test_text_after_process_raw_prediction():
+def test_prompt_tokens_completion_tokens():
     """
     /v1/chat/completions接口,无return_token_ids参数
-    无return token ids 属性"text_after_process"和"reasoning_content"值为null
+    无return token ids 属性"prompt_tokens"和"completion_tokens"值为null
     """
     data = {
         "stream": False,
@@ -160,8 +160,8 @@
     payload = build_request_payload(TEMPLATE, data)
     resp = send_request(URL, payload).json()
 
-    text_after_process = resp["choices"][0]["message"]["text_after_process"]
-    assert text_after_process is None, "text_after_process取值结果不正确"
+    prompt_tokens = resp["choices"][0]["message"]["prompt_tokens"]
+    assert prompt_tokens is None, "prompt_tokens取值结果不正确"
 
-    raw_prediction = resp["choices"][0]["message"]["raw_prediction"]
-    assert raw_prediction is None, "raw_prediction取值结果不正确"
+    completion_tokens = resp["choices"][0]["message"]["completion_tokens"]
+    assert completion_tokens is None, "completion_tokens取值结果不正确"
diff --git a/tests/entrypoints/openai/test_completion_echo.py b/tests/entrypoints/openai/test_completion_echo.py
index 3e8d8ac79..a2d4313db 100644
--- a/tests/entrypoints/openai/test_completion_echo.py
+++ b/tests/entrypoints/openai/test_completion_echo.py
@@ -57,7 +57,7 @@ class TestCompletionEcho(unittest.IsolatedAsyncioTestCase):
             model_name="test_model",
             prompt_batched_token_ids=[[1, 2]],
             completion_batched_token_ids=[[3, 4, 5]],
-            text_after_process_list=["test prompt"],
+            prompt_tokens_list=["test prompt"],
         )
 
         self.assertEqual(response.choices[0].text, "test prompt generated text")
@@ -90,7 +90,7 @@
             model_name="test_model",
             prompt_batched_token_ids=[[1, 2]],
             completion_batched_token_ids=[[3, 4, 5]],
-            text_after_process_list=["test prompt"],
+            prompt_tokens_list=["test prompt"],
         )
 
         self.assertEqual(response.choices[0].text, "decoded_[1, 2, 3] generated text")
@@ -123,7 +123,7 @@
             model_name="test_model",
             prompt_batched_token_ids=[[1], [2]],
             completion_batched_token_ids=[[1, 2], [3, 4]],
-            text_after_process_list=["prompt1", "prompt2"],
+            prompt_tokens_list=["prompt1", "prompt2"],
         )
 
         self.assertEqual(len(response.choices), 2)
@@ -159,7 +159,7 @@
             model_name="test_model",
             prompt_batched_token_ids=[[1], [2]],
             completion_batched_token_ids=[[1, 2], [3, 4]],
-            text_after_process_list=["prompt1", "prompt2"],
+            prompt_tokens_list=["prompt1", "prompt2"],
        )
 
         self.assertEqual(len(response.choices), 2)
diff --git a/tests/entrypoints/openai/test_max_streaming_tokens.py b/tests/entrypoints/openai/test_max_streaming_tokens.py
index 61d5f88d4..b59a49fe2 100644
--- a/tests/entrypoints/openai/test_max_streaming_tokens.py
+++ b/tests/entrypoints/openai/test_max_streaming_tokens.py
@@ -160,7 +160,7 @@ class TestMaxStreamingResponseTokens(IsolatedAsyncioTestCase):
             request_id="test-request-id",
             model_name="test-model",
             prompt_token_ids=[1, 2, 3],
-            text_after_process="Hello",
+            prompt_tokens="Hello",
         )
 
         chunks = []
@@ -242,7 +242,7 @@
             model_name="test-model",
             created_time=11,
             prompt_batched_token_ids=[[1, 2, 3]],
-            text_after_process_list=["Hello"],
+            prompt_tokens_list=["Hello"],
         )
 
         chunks = []
diff --git a/tests/entrypoints/openai/test_run_batch.py b/tests/entrypoints/openai/test_run_batch.py
index f785797c6..2e35c12bd 100644
--- a/tests/entrypoints/openai/test_run_batch.py
+++ b/tests/entrypoints/openai/test_run_batch.py
@@ -54,8 +54,8 @@ INVALID_INPUT_BATCH = """
 """
 
 BATCH_RESPONSE = """
-{"id":"fastdeploy-7fcc30e2e4334fca806c4d01ee7ac4ab","custom_id":"req-00001","response":{"status_code":200,"request_id":"fastdeploy-batch-5f4017beded84b15aa3a8b0f1fce154c","body":{"id":"chatcmpl-33b09ae5-a8f1-40ad-9110-efa2b381eac9","object":"chat.completion","created":1758698637,"model":"/root/paddlejob/zhaolei36/ernie-4_5-0_3b-bf16-paddle","choices":[{"index":0,"message":{"role":"assistant","content":"In a sunlit meadow where dreams bloom,\\nA gentle breeze carries the breeze,\\nThe leaves rustle like ancient letters,\\nAnd in the sky, a song of hope and love.","multimodal_content":null,"reasoning_content":null,"tool_calls":null,"prompt_token_ids":null,"completion_token_ids":null,"text_after_process":null,"raw_prediction":null,"prompt_tokens":null,"completion_tokens":null},"logprobs":null,"finish_reason":"stop"}],"usage":{"prompt_tokens":19,"total_tokens":60,"completion_tokens":41,"prompt_tokens_details":{"cached_tokens":0}}}},"error":null}
-{"id":"fastdeploy-bf549849df2145598ae1758ba260f784","custom_id":"req-00002","response":{"status_code":200,"request_id":"fastdeploy-batch-81223f12fdc345efbfe85114ced10a1d","body":{"id":"chatcmpl-9479e36c-1542-45ff-b364-1dc6d34be9e7","object":"chat.completion","created":1758698637,"model":"/root/paddlejob/zhaolei36/ernie-4_5-0_3b-bf16-paddle","choices":[{"index":0,"message":{"role":"assistant","content":"Based on the given text, here are some possible actions you can take:\\n\\n1. **Read the question**: To understand what you can do, you can read the question (id=2) and analyze its requirements or constraints.\\n2. **Identify the keywords**: Look for specific keywords or phrases that describe what you can do. For example, if the question mentions \\"coding,\\" you can focus on coding skills or platforms.\\n3. **Brainstorm ideas**: You can think creatively about different ways to perform the action. For example, you could brainstorm different methods of communication, data analysis, or problem-solving.\\n4. **Explain your action**: If you have knowledge or skills in a particular area, you can explain how you would use those skills to achieve the desired outcome.\\n5. **Ask for help**: If you need assistance, you can ask for help from a friend, teacher, or mentor.","multimodal_content":null,"reasoning_content":null,"tool_calls":null,"prompt_token_ids":null,"completion_token_ids":null,"text_after_process":null,"raw_prediction":null,"prompt_tokens":null,"completion_tokens":null},"logprobs":null,"finish_reason":"stop"}],"usage":{"prompt_tokens":17,"total_tokens":211,"completion_tokens":194,"prompt_tokens_details":{"cached_tokens":0}}}},"error":null}
+{"id":"fastdeploy-7fcc30e2e4334fca806c4d01ee7ac4ab","custom_id":"req-00001","response":{"status_code":200,"request_id":"fastdeploy-batch-5f4017beded84b15aa3a8b0f1fce154c","body":{"id":"chatcmpl-33b09ae5-a8f1-40ad-9110-efa2b381eac9","object":"chat.completion","created":1758698637,"model":"/root/paddlejob/zhaolei36/ernie-4_5-0_3b-bf16-paddle","choices":[{"index":0,"message":{"role":"assistant","content":"In a sunlit meadow where dreams bloom,\\nA gentle breeze carries the breeze,\\nThe leaves rustle like ancient letters,\\nAnd in the sky, a song of hope and love.","multimodal_content":null,"reasoning_content":null,"tool_calls":null,"prompt_token_ids":null,"completion_token_ids":null,"prompt_tokens":null,"completion_tokens":null},"logprobs":null,"finish_reason":"stop"}],"usage":{"prompt_tokens":19,"total_tokens":60,"completion_tokens":41,"prompt_tokens_details":{"cached_tokens":0}}}},"error":null}
+{"id":"fastdeploy-bf549849df2145598ae1758ba260f784","custom_id":"req-00002","response":{"status_code":200,"request_id":"fastdeploy-batch-81223f12fdc345efbfe85114ced10a1d","body":{"id":"chatcmpl-9479e36c-1542-45ff-b364-1dc6d34be9e7","object":"chat.completion","created":1758698637,"model":"/root/paddlejob/zhaolei36/ernie-4_5-0_3b-bf16-paddle","choices":[{"index":0,"message":{"role":"assistant","content":"Based on the given text, here are some possible actions you can take:\\n\\n1. **Read the question**: To understand what you can do, you can read the question (id=2) and analyze its requirements or constraints.\\n2. **Identify the keywords**: Look for specific keywords or phrases that describe what you can do. For example, if the question mentions \\"coding,\\" you can focus on coding skills or platforms.\\n3. **Brainstorm ideas**: You can think creatively about different ways to perform the action. For example, you could brainstorm different methods of communication, data analysis, or problem-solving.\\n4. **Explain your action**: If you have knowledge or skills in a particular area, you can explain how you would use those skills to achieve the desired outcome.\\n5. **Ask for help**: If you need assistance, you can ask for help from a friend, teacher, or mentor.","multimodal_content":null,"reasoning_content":null,"tool_calls":null,"prompt_token_ids":null,"completion_token_ids":null,"prompt_tokens":null,"completion_tokens":null},"logprobs":null,"finish_reason":"stop"}],"usage":{"prompt_tokens":17,"total_tokens":211,"completion_tokens":194,"prompt_tokens_details":{"cached_tokens":0}}}},"error":null}
 """
 
 
@@ -867,8 +867,6 @@ class TestFileOperations(unittest.TestCase):
             tool_calls=message_data["tool_calls"],
             prompt_token_ids=message_data["prompt_token_ids"],
             completion_token_ids=message_data["completion_token_ids"],
-            text_after_process=message_data["text_after_process"],
-            raw_prediction=message_data["raw_prediction"],
             prompt_tokens=message_data["prompt_tokens"],
             completion_tokens=message_data["completion_tokens"],
         )
diff --git a/tests/entrypoints/openai/test_serving_completion.py b/tests/entrypoints/openai/test_serving_completion.py
index 690cefa47..f24d7664d 100644
--- a/tests/entrypoints/openai/test_serving_completion.py
+++ b/tests/entrypoints/openai/test_serving_completion.py
@@ -155,7 +155,7 @@ class TestOpenAIServingCompletion(unittest.TestCase):
             model_name=model_name,
             prompt_batched_token_ids=prompt_batched_token_ids,
             completion_batched_token_ids=completion_batched_token_ids,
-            text_after_process_list=["1", "1"],
+            prompt_tokens_list=["1", "1"],
         )
 
         assert completion_response.id == request_id
diff --git a/tests/input/test_ernie_processor.py b/tests/input/test_ernie_processor.py
index b2357eeaa..04422ddbc 100644
--- a/tests/input/test_ernie_processor.py
+++ b/tests/input/test_ernie_processor.py
@@ -61,7 +61,7 @@ class TestErnie4_5ProcessorProcessResponseDictStreaming(unittest.TestCase):
         result = self.processor.process_response_dict_streaming(response_dict, **kwargs)
 
         # 验证结果
-        self.assertEqual(result["outputs"]["raw_prediction"], "delta_text")
+        self.assertEqual(result["outputs"]["completion_tokens"], "delta_text")
 
     def test_process_request_dict(self):
         request_dict = {
diff --git a/tests/input/test_qwen_vl_processor.py b/tests/input/test_qwen_vl_processor.py
index 1bb088b27..43e8c5542 100644
--- a/tests/input/test_qwen_vl_processor.py
+++ b/tests/input/test_qwen_vl_processor.py
@@ -276,7 +276,7 @@ class TestQwenVLProcessor(unittest.TestCase):
         # Create equivalent request in prompt format
         prompt = {
             "request_id": "12345",
-            "prompt": request["text_after_process"],
+            "prompt": request["prompt_tokens"],
             "multimodal_data": {
                 "image": [mock_pil_image(480, 640)],
                 "video": [{"video": b"123"}],
@@ -300,7 +300,7 @@
 
         This test verifies that:
         - The processor correctly handles multimodal messages (image, video, text)
-        - The text_after_process field matches the output from direct tokenizer application
+        - The prompt_tokens field matches the output from direct tokenizer application
         - The chat template application preserves the message structure and content
 
         Test Steps:
@@ -345,7 +345,7 @@
         # Process request through the processor
         self.processor.process_request_dict(request, 1024 * 100)
 
-        prompt2 = request["text_after_process"]
+        prompt2 = request["prompt_tokens"]
 
         # Verify both methods produce identical prompt strings
         self.assertEqual(prompt, prompt2)
diff --git a/tests/utils/test_custom_chat_template.py b/tests/utils/test_custom_chat_template.py
index 1311289ff..b0b436ab1 100644
--- a/tests/utils/test_custom_chat_template.py
+++ b/tests/utils/test_custom_chat_template.py
@@ -62,7 +62,7 @@ class TestLodChatTemplate(unittest.IsolatedAsyncioTestCase):
         )
 
         async def mock_chat_completion_full_generator(
-            request, request_id, model_name, prompt_token_ids, text_after_process
+            request, request_id, model_name, prompt_token_ids, prompt_tokens
         ):
             return prompt_token_ids
 
@@ -89,7 +89,7 @@
         )
 
         async def mock_chat_completion_full_generator(
-            request, request_id, model_name, prompt_token_ids, text_after_process
+            request, request_id, model_name, prompt_token_ids, prompt_tokens
         ):
             return prompt_token_ids