From ee9d8a840aad7a072dc8db7ba31aa2245fee1d5e Mon Sep 17 00:00:00 2001
From: luukunn <83932082+luukunn@users.noreply.github.com>
Date: Fri, 19 Sep 2025 14:26:01 +0800
Subject: [PATCH] [fix] Rename the generation-continuation parameter and
 revise the thinking-length validation (#4086)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* Rename the continuation parameter generated_token_ids to completion_token_ids; revise how the thinking length is validated
* add completion_token_ids
* add logger
* fix reasoning_max_tokens ParameterError
* add unit tests
---
 fastdeploy/entrypoints/engine_client.py      |  9 +++-
 fastdeploy/entrypoints/openai/protocol.py    |  4 ++
 .../ernie4_5_vl_processor.py                 | 14 +++---
 .../qwen_vl_processor/qwen_vl_processor.py   | 16 +++---
 tests/e2e/test_EB_VL_Lite_serving.py         | 50 +++++++++++++++++++
 tests/input/test_qwen_vl_processor.py        |  6 +--
 6 files changed, 75 insertions(+), 24 deletions(-)

diff --git a/fastdeploy/entrypoints/engine_client.py b/fastdeploy/entrypoints/engine_client.py
index de6eb9e91..5fe3f531e 100644
--- a/fastdeploy/entrypoints/engine_client.py
+++ b/fastdeploy/entrypoints/engine_client.py
@@ -236,8 +236,13 @@ class EngineClient:
                 raise ParameterError("max_tokens", f"max_tokens can be defined [1, {self.max_model_len}).")
 
         if data.get("reasoning_max_tokens") is not None:
-            if data["reasoning_max_tokens"] > data["max_tokens"] or data["reasoning_max_tokens"] < 1:
-                raise ParameterError("reasoning_max_tokens", "reasoning_max_tokens must be between max_tokens and 1")
+            if data["reasoning_max_tokens"] < 1:
+                raise ParameterError("reasoning_max_tokens", "reasoning_max_tokens must be greater than 1")
+            if data["reasoning_max_tokens"] > data["max_tokens"]:
+                data["reasoning_max_tokens"] = data["max_tokens"]
+                api_server_logger.warning(
+                    f"req_id: {data['request_id']}, reasoning_max_tokens exceeds max_tokens, the value of reasoning_max_tokens will be adjusted to match that of max_tokens"
+                )
 
         # logprobs
         logprobs = data.get("logprobs")
diff --git a/fastdeploy/entrypoints/openai/protocol.py b/fastdeploy/entrypoints/openai/protocol.py
index 5c6f32f71..81a41f016 100644
--- a/fastdeploy/entrypoints/openai/protocol.py
+++ b/fastdeploy/entrypoints/openai/protocol.py
@@ -593,6 +593,7 @@ class ChatCompletionRequest(BaseModel):
     prompt_token_ids: Optional[List[int]] = None
     max_streaming_response_tokens: Optional[int] = None
    disable_chat_template: Optional[bool] = False
+    completion_token_ids: Optional[List[int]] = None
 
     # doc: end-chat-completion-extra-params
     def to_dict_for_infer(self, request_id=None):
@@ -618,6 +619,9 @@ class ChatCompletionRequest(BaseModel):
             ), "The parameter `raw_request` is not supported now, please use completion api instead."
             for key, value in self.metadata.items():
                 req_dict[key] = value
+            from fastdeploy.utils import api_server_logger
+
+            api_server_logger.warning("The parameter metadata is obsolete.")
         for key, value in self.dict().items():
             if value is not None:
                 req_dict[key] = value
diff --git a/fastdeploy/input/ernie4_5_vl_processor/ernie4_5_vl_processor.py b/fastdeploy/input/ernie4_5_vl_processor/ernie4_5_vl_processor.py
index ce5187e3f..77690b920 100644
--- a/fastdeploy/input/ernie4_5_vl_processor/ernie4_5_vl_processor.py
+++ b/fastdeploy/input/ernie4_5_vl_processor/ernie4_5_vl_processor.py
@@ -241,10 +241,8 @@ class Ernie4_5_VLProcessor(Ernie4_5Processor):
         else:
             raise ValueError(f"Request must contain 'prompt', or 'messages': {request}")
 
-        metadata = request.get("metadata")
-        # 如果metadata包含之前输出的token,将这些token添加到input_ids末尾
-        if metadata and metadata.get("generated_token_ids"):
-            self.append_generated_tokens(outputs, metadata["generated_token_ids"])
+        if request.get("completion_token_ids"):
+            self.append_completion_tokens(outputs, request["completion_token_ids"])
         outputs = self.pack_outputs(outputs)
         request["prompt_token_ids"] = outputs["input_ids"].tolist()
         request["prompt_token_ids_len"] = len(request["prompt_token_ids"])
@@ -259,11 +257,11 @@ class Ernie4_5_VLProcessor(Ernie4_5Processor):
 
         return request
 
-    def append_generated_tokens(self, multimodal_inputs, generated_token_ids):
-        "append already generated tokens"
+    def append_completion_tokens(self, multimodal_inputs, completion_token_ids):
+        "append already completion tokens"
 
-        num_tokens = len(generated_token_ids)
-        multimodal_inputs["input_ids"].extend(generated_token_ids)
+        num_tokens = len(completion_token_ids)
+        multimodal_inputs["input_ids"].extend(completion_token_ids)
         multimodal_inputs["token_type_ids"].extend([IDS_TYPE_FLAG["text"]] * num_tokens)
 
         start = multimodal_inputs["cur_position"]
diff --git a/fastdeploy/input/qwen_vl_processor/qwen_vl_processor.py b/fastdeploy/input/qwen_vl_processor/qwen_vl_processor.py
index a3adeddf1..00856ec01 100644
--- a/fastdeploy/input/qwen_vl_processor/qwen_vl_processor.py
+++ b/fastdeploy/input/qwen_vl_processor/qwen_vl_processor.py
@@ -245,15 +245,11 @@ class QwenVLProcessor(TextProcessor):
         else:
             raise ValueError(f"Request must contain 'prompt', or 'messages': {request}")
 
-        metadata = request.get("metadata")
         # Handle continuation of previous generation by appending existing tokens
-        if metadata and metadata.get("generated_token_ids"):
-            self.append_generated_tokens(outputs, metadata["generated_token_ids"])
+        if request.get("completion_token_ids"):
+            self.append_completion_tokens(outputs, request["completion_token_ids"])
 
         enable_thinking = False
-        if metadata:
-            enable_thinking = metadata.get("enable_thinking", False)
-
         if request.get("chat_template_kwargs"):
             chat_template_kwargs = request.get("chat_template_kwargs")
             enable_thinking = chat_template_kwargs.get("enable_thinking", False)
@@ -278,16 +274,16 @@ class QwenVLProcessor(TextProcessor):
 
         return request
 
-    def append_generated_tokens(self, outputs, generated_token_ids):
+    def append_completion_tokens(self, outputs, completion_token_ids):
         """
-        Append generated tokens to existing outputs.
+        Append completion tokens to existing outputs.
 
         Args:
             outputs: Current model outputs
-            generated_token_ids: Generated tokens to append
+            completion_token_ids: completion tokens to append
         """
         out = {"input_ids": [], "token_type_ids": [], "position_ids": [], "cur_position": outputs["cur_position"]}
-        self.processor._add_text(generated_token_ids, out)
+        self.processor._add_text(completion_token_ids, out)
 
         outputs["input_ids"] = np.concatenate(
             [outputs["input_ids"], np.array(out["input_ids"], dtype=np.int64)], axis=0
diff --git a/tests/e2e/test_EB_VL_Lite_serving.py b/tests/e2e/test_EB_VL_Lite_serving.py
index af6daf246..027e663fe 100644
--- a/tests/e2e/test_EB_VL_Lite_serving.py
+++ b/tests/e2e/test_EB_VL_Lite_serving.py
@@ -255,6 +255,16 @@ def test_consistency_between_runs(api_url, headers, consistent_payload):
     assert content1 == content2
 
 
+def test_with_metadata(api_url, headers, consistent_payload):
+    """
+    Test that result is same as the base result.
+    """
+    # request
+    consistent_payload["metadata"] = {"enable_thinking": True}
+    resp1 = requests.post(api_url, headers=headers, json=consistent_payload)
+    assert resp1.status_code == 200
+
+
 # ==========================
 # OpenAI Client Chat Completion Test
 # ==========================
@@ -555,6 +565,46 @@ def test_chat_with_thinking(openai_client, capsys):
     assert reasoning_tokens <= reasoning_max_tokens
 
 
+def test_chat_with_completion_token_ids(openai_client):
+    """Test completion_token_ids"""
+    response = openai_client.chat.completions.create(
+        model="default",
+        messages=[{"role": "user", "content": "Hello"}],
+        extra_body={
+            "completion_token_ids": [94936],
+            "return_token_ids": True,
+            "reasoning_max_tokens": 20,
+            "max_tokens": 10,
+        },
+        max_tokens=10,
+        stream=False,
+    )
+    assert hasattr(response, "choices")
+    assert len(response.choices) > 0
+    assert hasattr(response.choices[0], "message")
+    assert hasattr(response.choices[0].message, "prompt_token_ids")
+    assert isinstance(response.choices[0].message.prompt_token_ids, list)
+    assert 94936 in response.choices[0].message.prompt_token_ids
+
+
+def test_chat_with_reasoning_max_tokens(openai_client):
+    """Test completion_token_ids"""
+    assertion_executed = False
+    try:
+        openai_client.chat.completions.create(
+            model="default",
+            messages=[{"role": "user", "content": "Hello"}],
+            extra_body={"completion_token_ids": [18900], "return_token_ids": True, "reasoning_max_tokens": -1},
+            max_tokens=10,
+            stream=False,
+        )
+    except openai.InternalServerError as e:
+        error_message = str(e)
+        assertion_executed = True
+        assert "reasoning_max_tokens must be greater than 1" in error_message
+    assert assertion_executed, "Assertion was not executed (no exception raised)"
+
+
 def test_profile_reset_block_num():
     """测试profile reset_block_num功能,与baseline diff不能超过5%"""
     log_file = "./log/config.log"
diff --git a/tests/input/test_qwen_vl_processor.py b/tests/input/test_qwen_vl_processor.py
index 4936bc7a5..1bb088b27 100644
--- a/tests/input/test_qwen_vl_processor.py
+++ b/tests/input/test_qwen_vl_processor.py
@@ -176,12 +176,10 @@ class TestQwenVLProcessor(unittest.TestCase):
         3. Video processing produces expected output dimensions
         4. Correct counts for images (1) and videos (1)
         """
-        num_generated_token_ids = 10
+        num_completion_token_ids = 10
         request = {
             "request_id": "12345",
-            "metadata": {
-                "generated_token_ids": [1] * num_generated_token_ids,
-            },
+            "completion_token_ids": [1] * num_completion_token_ids,
             "stop": ["stop", "eof"],
             "messages": [
                 {