[Feature] Add AsyncTokenizerClient & ChatResponseProcessor with remote encode & decode support. (#3674)

* [Feature] add AsyncTokenizerClient

* add decode_image

* Add response_processors with remote decode support.

* [Feature] add tokenizer_base_url startup argument

* Revert comment removal and restore original content.

* [Feature] Non-streaming requests now support remote image decoding.

* Fix parameter type issue in decode_image call.

* Keep completion_token_ids when return_token_ids = False.

* add copyright
Author: SunLei, 2025-08-30 17:06:26 +08:00 (committed by GitHub)
parent 9a7c231f2c · commit b9af95cf1c
13 changed files with 757 additions and 25 deletions
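
For a quick end-to-end check, a hedged client-side sketch follows: the host, port, model name, and prompt are placeholders, the /v1/chat/completions route is the standard OpenAI-compatible one, and the server is assumed to have been started with --enable-mm-output (plus --tokenizer-base-url when remote image decoding is wanted). Only the multimodal_content field and its part shapes come from this PR.

import httpx

payload = {
    "model": "default",  # placeholder model name
    "messages": [{"role": "user", "content": "Draw a cat and describe it."}],
    "stream": False,
}
resp = httpx.post("http://localhost:8000/v1/chat/completions", json=payload, timeout=60)
resp.raise_for_status()
message = resp.json()["choices"][0]["message"]
# With --enable-mm-output the parts arrive in multimodal_content instead of content;
# each part is {"type": "text", "text": ...} or {"type": "image", "url": ...}.
for part in message.get("multimodal_content") or []:
    print(part)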

View File

@@ -0,0 +1,74 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
import asyncio
from fastdeploy.input.tokenzier_client import (
AsyncTokenizerClient,
ImageDecodeRequest,
ImageEncodeRequest,
VideoEncodeRequest,
)
async def main():
"""
Test the AsyncTokenizerClient class.
"""
base_url = "http://example.com/"
client = AsyncTokenizerClient(base_url=base_url)
# Test image encode request
image_encode_request = ImageEncodeRequest(
version="v1", req_id="req_image_001", is_gen=False, resolution=512, image_url="http://example.com/image.jpg"
)
image_encode_ret = await client.encode_image(image_encode_request)
print(f"Image encode result:{image_encode_ret}")
# Test video encode request
video_encode_req = VideoEncodeRequest(
version="v1",
req_id="req_video_001",
video_url="http://example.com/video.mp4",
is_gen=False,
resolution=1024,
start_ts=0,
end_ts=5,
frames=1,
)
video_encode_result = await client.encode_video(video_encode_req)
print(f"Video Encode Result:{video_encode_result}")
# Test image decode request
with open("./image_decode_demo.json", "r", encoding="utf-8") as file:
import json
import time
start_time = time.time()
start_process_time = time.process_time()  # record the start time
json_data = json.load(file)
image_decoding_request = ImageDecodeRequest(req_id="req_image_001", data=json_data.get("data"))
image_decode_result = await client.decode_image(image_decoding_request)
print(f"Image decode result:{image_decode_result}")
elapsed_time = time.time() - start_time
elapsed_process_time = time.process_time() - start_process_time
print(f"decode elapsed_time: {elapsed_time:.6f}s, elapsed_process_time: {elapsed_process_time:.6f}s")
if __name__ == "__main__":
asyncio.run(main())

View File

@@ -71,6 +71,10 @@ class EngineArgs:
"""
The name or path of the tokenizer (defaults to model path if not provided).
"""
tokenizer_base_url: str = None
"""
The base URL of the remote tokenizer service (used instead of local tokenizer if provided).
"""
max_model_len: int = 2048
"""
Maximum context length supported by the model.
@@ -426,6 +430,12 @@ class EngineArgs:
default=EngineArgs.tokenizer,
help="Tokenizer name or path (defaults to model path if not specified).",
)
model_group.add_argument(
"--tokenizer-base-url",
type=nullable_str,
default=EngineArgs.tokenizer_base_url,
help="The base URL of the remote tokenizer service (used instead of local tokenizer if provided).",
)
model_group.add_argument(
"--max-model-len",
type=int,

View File

@@ -77,6 +77,9 @@ parser.add_argument(
help="max waiting time for connection, if set value -1 means no waiting time limit",
)
parser.add_argument("--max-concurrency", default=512, type=int, help="max concurrency")
parser.add_argument(
"--enable-mm-output", action="store_true", help="Enable 'multimodal_content' field in response output. "
)
parser = EngineArgs.add_cli_args(parser)
args = parser.parse_args()
args.model = retrive_model_from_server(args.model, args.revision)
@@ -176,7 +179,14 @@ async def lifespan(app: FastAPI):
)
app.state.model_handler = model_handler
chat_handler = OpenAIServingChat(
engine_client, app.state.model_handler, pid, args.ips, args.max_waiting_time, chat_template
engine_client,
app.state.model_handler,
pid,
args.ips,
args.max_waiting_time,
chat_template,
args.enable_mm_output,
args.tokenizer_base_url,
)
completion_handler = OpenAIServingCompletion(
engine_client,

View File

@@ -163,8 +163,9 @@ class ChatMessage(BaseModel):
Chat message.
"""
role: str
content: str
role: Optional[str] = None
content: Optional[str] = None
multimodal_content: Optional[List[Any]] = None
reasoning_content: Optional[str] = None
tool_calls: Optional[List[DeltaToolCall | ToolCall]] = None
prompt_token_ids: Optional[List[int]] = None
@@ -226,6 +227,7 @@ class DeltaMessage(BaseModel):
role: Optional[str] = None
content: Optional[str] = None
multimodal_content: Optional[List[Any]] = None
prompt_token_ids: Optional[List[int]] = None
completion_token_ids: Optional[List[int]] = None
reasoning_content: Optional[str] = None
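
A minimal sketch of how the new optional field is meant to be populated, assuming the remaining ChatMessage/DeltaMessage fields keep their defaults (the image URL below is a placeholder):

from fastdeploy.entrypoints.openai.protocol import ChatMessage, DeltaMessage

# Non-streaming responses carry the parts on ChatMessage.multimodal_content.
msg = ChatMessage(
    role="assistant",
    multimodal_content=[
        {"type": "text", "text": "Here is the picture:"},
        {"type": "image", "url": "http://example.com/generated.png"},  # placeholder URL
    ],
)

# Streaming responses deliver the same part shapes incrementally on DeltaMessage.
delta = DeltaMessage(role="assistant", multimodal_content=[{"type": "text", "text": ""}])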

View File

@@ -0,0 +1,145 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
from typing import Any, List, Optional
from fastdeploy.input.tokenzier_client import AsyncTokenizerClient, ImageDecodeRequest
class ChatResponseProcessor:
"""
A decoder class to build multimodal content (text/image) from token_ids.
Attributes:
eoi_token_id: Token ID indicating the end of an image (<eoi>).
"""
def __init__(
self,
data_processor,
enable_mm_output: Optional[bool] = False,
eoi_token_id: Optional[int] = 101032,
eos_token_id: Optional[int] = 2,
decoder_base_url: Optional[str] = None,
):
self.data_processor = data_processor
self.enable_mm_output = enable_mm_output
self.eoi_token_id = eoi_token_id
self.eos_token_id = eos_token_id
self.decoder_client = AsyncTokenizerClient(base_url=decoder_base_url) if decoder_base_url else None
self._mm_buffer: List[Any] = [] # Buffer for accumulating image token_ids
self._end_image_code_request_output: Optional[Any] = None
self._multipart_buffer = []
def enable_multimodal_content(self):
return self.enable_mm_output
def accumulate_token_ids(self, request_output):
decode_type = request_output["outputs"].get("decode_type", 0)
if not self._multipart_buffer:
self._multipart_buffer.append({"decode_type": decode_type, "request_output": request_output})
else:
last_part = self._multipart_buffer[-1]
if last_part["decode_type"] == decode_type:
last_token_ids = last_part["request_output"]["outputs"]["token_ids"]
last_token_ids.extend(request_output["outputs"]["token_ids"])
request_output["outputs"]["token_ids"] = last_token_ids
last_part["request_output"] = request_output
else:
self._multipart_buffer.append({"decode_type": decode_type, "request_output": request_output})
async def process_response_chat(self, request_outputs, stream, enable_thinking, include_stop_str_in_output):
"""
Process a list of responses into a generator that yields each processed response as it's generated.
Args:
request_outputs: The list of outputs to be processed.
stream: Whether or not to stream the output.
enable_thinking: Whether or not to show thinking messages.
include_stop_str_in_output: Whether or not to include stop strings in the output.
"""
for request_output in request_outputs:
if not self.enable_mm_output:
yield self.data_processor.process_response_dict(
response_dict=request_output,
stream=stream,
enable_thinking=enable_thinking,
include_stop_str_in_output=include_stop_str_in_output,
)
elif stream:
decode_type = request_output["outputs"].get("decode_type", 0)
token_ids = request_output["outputs"]["token_ids"]
if decode_type == 0:
if self.eoi_token_id and self.eoi_token_id in token_ids:
if self._mm_buffer:
all_tokens = self._mm_buffer
self._mm_buffer = []
image = {"type": "image"}
if self.decoder_client:
req_id = request_output["request_id"]
image_ret = await self.decoder_client.decode_image(
request=ImageDecodeRequest(req_id=req_id, data=all_tokens)
)
image["url"] = image_ret["http_url"]
image_output = self._end_image_code_request_output
image_output["outputs"]["multipart"] = [image]
image_output["outputs"]["token_ids"] = all_tokens
yield image_output
self.data_processor.process_response_dict(
response_dict=request_output,
stream=stream,
enable_thinking=enable_thinking,
include_stop_str_in_output=include_stop_str_in_output,
)
text = {"type": "text", "text": request_output["outputs"]["text"]}
request_output["outputs"]["multipart"] = [text]
yield request_output
elif decode_type == 1:
self._mm_buffer.extend(token_ids)
self._end_image_code_request_output = request_output
else:
self.accumulate_token_ids(request_output)
token_ids = request_output["outputs"]["token_ids"]
if token_ids[-1] == self.eos_token_id:
multipart = []
for part in self._multipart_buffer:
if part["decode_type"] == 0:
self.data_processor.process_response_dict(
response_dict=part["request_output"],
stream=False,
enable_thinking=enable_thinking,
include_stop_str_in_output=include_stop_str_in_output,
)
text = {"type": "text", "text": part["request_output"]["outputs"]["text"]}
multipart.append(text)
elif part["decode_type"] == 1:
image = {"type": "image"}
if self.decoder_client:
req_id = part["request_output"]["request_id"]
all_tokens = part["request_output"]["outputs"]["token_ids"]
image_ret = await self.decoder_client.decode_image(
request=ImageDecodeRequest(req_id=req_id, data=all_tokens)
)
image["url"] = image_ret["http_url"]
multipart.append(image)
last_request_output = self._multipart_buffer[-1]["request_output"]
last_request_output["outputs"]["multipart"] = multipart
yield last_request_output
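
A minimal usage sketch of the processor in streaming mode, using a stub in place of the engine's data_processor (the token IDs and text are placeholders; image parts, i.e. decode_type == 1, additionally require a decoder service reachable via decoder_base_url):

import asyncio
from unittest.mock import MagicMock
from fastdeploy.entrypoints.openai.response_processors import ChatResponseProcessor

async def demo():
    # Stand-in for engine_client.data_processor; only process_response_dict is needed here.
    data_processor = MagicMock()
    data_processor.process_response_dict = MagicMock(side_effect=lambda response_dict, **_: response_dict)
    processor = ChatResponseProcessor(data_processor=data_processor, enable_mm_output=True)
    outputs = [{"request_id": "r1", "outputs": {"decode_type": 0, "token_ids": [10], "text": "hello"}}]
    async for res in processor.process_response_chat(
        outputs, stream=True, enable_thinking=False, include_stop_str_in_output=False
    ):
        print(res["outputs"]["multipart"])  # [{'type': 'text', 'text': 'hello'}]

asyncio.run(demo())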

View File

@@ -36,6 +36,7 @@ from fastdeploy.entrypoints.openai.protocol import (
PromptTokenUsageInfo,
UsageInfo,
)
from fastdeploy.entrypoints.openai.response_processors import ChatResponseProcessor
from fastdeploy.metrics.work_metrics import work_process_metrics
from fastdeploy.utils import api_server_logger
from fastdeploy.worker.output import LogprobsLists
@@ -46,12 +47,24 @@ class OpenAIServingChat:
OpenAI-style chat completions serving
"""
def __init__(self, engine_client, models, pid, ips, max_waiting_time, chat_template):
def __init__(
self,
engine_client,
models,
pid,
ips,
max_waiting_time,
chat_template,
enable_mm_output: Optional[bool] = False,
tokenizer_base_url: Optional[str] = None,
):
self.engine_client = engine_client
self.models = models
self.pid = pid
self.max_waiting_time = max_waiting_time
self.chat_template = chat_template
self.enable_mm_output = enable_mm_output
self.tokenizer_base_url = tokenizer_base_url
if ips is not None:
if isinstance(ips, list):
self.master_ip = ips[0]
@@ -198,6 +211,11 @@ class OpenAIServingChat:
dealer.write([b"", request_id.encode("utf-8")])
choices = []
current_waiting_time = 0
response_processor = ChatResponseProcessor(
data_processor=self.engine_client.data_processor,
enable_mm_output=self.enable_mm_output,
decoder_base_url=self.tokenizer_base_url,
)
while num_choices > 0:
try:
response = await asyncio.wait_for(response_queue.get(), timeout=10)
@@ -215,17 +233,18 @@ class OpenAIServingChat:
current_waiting_time = 0
await asyncio.sleep(0.01)
continue
for res in response:
generator = response_processor.process_response_chat(
response,
stream=True,
enable_thinking=enable_thinking,
include_stop_str_in_output=include_stop_str_in_output,
)
async for res in generator:
if res.get("error_code", 200) != 200:
raise ValueError("{}".format(res["error_msg"]))
self.engine_client.data_processor.process_response_dict(
res,
stream=True,
enable_thinking=enable_thinking,
include_stop_str_in_output=include_stop_str_in_output,
)
if res["metrics"]["first_token_time"] is not None:
arrival_time = res["metrics"]["first_token_time"]
inference_start_time = res["metrics"]["inference_start_time"]
@@ -239,13 +258,22 @@ class OpenAIServingChat:
index=i,
delta=DeltaMessage(
role="assistant",
content="",
reasoning_content="",
tool_calls=None,
prompt_token_ids=None,
completion_token_ids=None,
),
)
if response_processor.enable_multimodal_content():
choice.delta.multimodal_content = [
{
"type": "text",
"text": "",
}
]
else:
choice.delta.content = ""
if request.return_token_ids:
choice.delta.prompt_token_ids = list(prompt_token_ids)
choice.delta.text_after_process = text_after_process
@@ -269,7 +297,6 @@ class OpenAIServingChat:
first_iteration = False
output = res["outputs"]
delta_text = output["text"]
output_top_logprobs = output["top_logprobs"]
previous_num_tokens += len(output["token_ids"])
logprobs_res: Optional[LogProbs] = None
@@ -279,12 +306,17 @@ class OpenAIServingChat:
)
delta_message = DeltaMessage(
content=delta_text,
reasoning_content="",
prompt_token_ids=None,
completion_token_ids=None,
tool_calls=None,
completion_token_ids=None,
)
if response_processor.enable_multimodal_content():
delta_message.multimodal_content = output["multipart"]
else:
delta_message.content = output["text"]
if not res["finished"] and "delta_message" in output:
delta_message_output = output["delta_message"]
if delta_message_output is None:
@@ -317,7 +349,10 @@ class OpenAIServingChat:
choice.finish_reason = "recover_stop"
if request.return_token_ids:
choice.delta.completion_token_ids = list(output["token_ids"])
if response_processor.enable_multimodal_content():
choice.delta.multimodal_content[0]["completion_token_ids"] = list(output["token_ids"])
else:
choice.delta.completion_token_ids = list(output["token_ids"])
choice.delta.raw_prediction = output.get("raw_prediction")
choice.delta.completion_tokens = output.get("raw_prediction")
if include_continuous_usage:
@@ -395,6 +430,11 @@ class OpenAIServingChat:
current_waiting_time = 0
logprob_contents = []
completion_token_ids = []
response_processor = ChatResponseProcessor(
data_processor=self.engine_client.data_processor,
enable_mm_output=self.enable_mm_output,
decoder_base_url=self.tokenizer_base_url,
)
while True:
try:
response = await asyncio.wait_for(response_queue.get(), timeout=10)
@@ -411,15 +451,16 @@ class OpenAIServingChat:
continue
task_is_finished = False
for data in response:
generator = response_processor.process_response_chat(
response,
stream=False,
enable_thinking=enable_thinking,
include_stop_str_in_output=include_stop_str_in_output,
)
async for data in generator:
if data.get("error_code", 200) != 200:
raise ValueError("{}".format(data["error_msg"]))
data = self.engine_client.data_processor.process_response_dict(
data,
stream=False,
enable_thinking=enable_thinking,
include_stop_str_in_output=include_stop_str_in_output,
)
# api_server_logger.debug(f"Client {request_id} received: {data}")
previous_num_tokens += len(data["outputs"]["token_ids"])
completion_token_ids.extend(data["outputs"]["token_ids"])
@@ -447,7 +488,6 @@ class OpenAIServingChat:
output = final_res["outputs"]
message = ChatMessage(
role="assistant",
content=output["text"],
reasoning_content=output.get("reasoning_content"),
tool_calls=output.get("tool_call"),
prompt_token_ids=prompt_token_ids if request.return_token_ids else None,
@@ -457,6 +497,12 @@ class OpenAIServingChat:
raw_prediction=output.get("raw_prediction") if request.return_token_ids else None,
completion_tokens=output.get("raw_prediction") if request.return_token_ids else None,
)
if response_processor.enable_multimodal_content():
message.multimodal_content = output.get("multipart")
else:
message.content = output["text"]
logprobs_full_res = None
if logprob_contents:
logprobs_full_res = LogProbs(content=logprob_contents)

View File

@@ -0,0 +1,163 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
import asyncio
from typing import Any, Optional, Union
import httpx
from pydantic import BaseModel, HttpUrl
from fastdeploy.utils import data_processor_logger
class BaseEncodeRequest(BaseModel):
version: str
req_id: str
is_gen: bool
resolution: int
class ImageEncodeRequest(BaseEncodeRequest):
image_url: Union[str, HttpUrl]
class VideoEncodeRequest(BaseEncodeRequest):
video_url: Union[str, HttpUrl]
start_ts: int
end_ts: int
frames: int
class ImageDecodeRequest(BaseModel):
req_id: str
data: list[Any]
class AsyncTokenizerClient:
def __init__(
self,
base_url: Optional[str] = None,
timeout: float = 5.0,
poll_interval: float = 0.5,
max_wait: float = 60.0,
):
"""
:param base_url: Base URL of the remote tokenizer service
:param timeout: Timeout of a single HTTP request (seconds)
:param poll_interval: Polling interval when waiting for results (seconds)
:param max_wait: Maximum wait time (seconds)
"""
self.base_url = base_url
self.timeout = timeout
self.poll_interval = poll_interval
self.max_wait = max_wait
async def encode_image(self, request: ImageEncodeRequest):
return await self._async_encode_request("image", request.__dict__)
async def encode_video(self, request: VideoEncodeRequest):
return await self._async_encode_request("video", request.__dict__)
async def decode_image(self, request: ImageDecodeRequest):
return await self._async_decode_request("image", request.__dict__)
async def log_request(self, request):
data_processor_logger.debug(f">>> Request: {request.method} {request.url}")
data_processor_logger.debug(f">>> Headers: {request.headers}")
if request.content:
data_processor_logger.debug(f">>> Content: {request.content.decode('utf-8')}")
async def log_response(self, response):
data_processor_logger.debug(f"<<< Response status: {response.status_code}")
data_processor_logger.debug(f"<<< Headers: {response.headers}")
async def _async_encode_request(self, type: str, request: dict):
if not self.base_url:
raise ValueError("Missing base_url")
async with httpx.AsyncClient(
timeout=self.timeout, event_hooks={"request": [self.log_request], "response": [self.log_response]}
) as client:
req_id = request.get("req_id")
try:
url = None
if type == "image":
url = f"{self.base_url}/image/encode"
elif type == "video":
url = f"{self.base_url}/video/encode"
else:
raise ValueError("Invalid type")
resp = await client.post(url, json=request)
resp.raise_for_status()
except httpx.RequestError as e:
raise RuntimeError(f"Failed to create tokenize task: {e}") from e
task_info = resp.json()
if task_info.get("code") != 0:
raise RuntimeError(f"Tokenize task creation failed, {task_info.get('message')}")
task_tag = task_info.get("task_tag")
if not task_tag:
raise RuntimeError("No task_tag returned from server")
# 2. Poll for the result
start_time = asyncio.get_event_loop().time()
while True:
try:
r = await client.get(
f"{self.base_url}/encode/get", params={"task_tag": task_tag, "req_id": req_id}
)
r.raise_for_status()
data = r.json()
# Current state of the async encode task: Processing, Finished, Error
if data.get("state") == "Finished":
return data.get("result")
elif data.get("state") == "Error":
raise RuntimeError(f"Tokenize task failed: {data.get('message')}")
except httpx.RequestError:
# Keep polling on transient network errors
pass
# Timeout check
if asyncio.get_event_loop().time() - start_time > self.max_wait:
raise TimeoutError(f"Tokenize task {task_tag} timed out after {self.max_wait}s")
await asyncio.sleep(self.poll_interval)
async def _async_decode_request(self, type: str, request: dict):
if not self.base_url:
raise ValueError("Missing base_url")
async with httpx.AsyncClient(
timeout=self.timeout, event_hooks={"request": [self.log_request], "response": [self.log_response]}
) as client:
try:
url = None
if type == "image":
url = f"{self.base_url}/image/decode"
else:
raise ValueError("Invalid type")
resp = await client.post(url, json=request)
resp.raise_for_status()
if resp.json().get("code") != 0:
raise RuntimeError(f"Tokenize task creation failed, {resp.json().get('message')}")
return resp.json().get("result")
except httpx.RequestError as e:
raise RuntimeError(f"Failed to decode: {e}") from e

View File

@@ -0,0 +1,84 @@
import httpx
import pytest
import respx
from fastdeploy.input.tokenzier_client import (
AsyncTokenizerClient,
ImageEncodeRequest,
VideoEncodeRequest,
)
@pytest.mark.asyncio
@respx.mock
async def test_encode_image_success():
base_url = "http://testserver"
client = AsyncTokenizerClient(base_url=base_url)
# Mock the task-creation endpoint
respx.post(f"{base_url}/image/encode").mock(
return_value=httpx.Response(200, json={"code": 0, "task_tag": "task123"})
)
# Mock the polling endpoint to return a finished state
mock_get_ret = {
"state": "Finished",
"result": {"feature_url": "bos://host:port/key", "feature_shape": [80, 45, 1563]},
}
respx.get(f"{base_url}/encode/get").mock(return_value=httpx.Response(200, json=mock_get_ret))
request = ImageEncodeRequest(
version="v1", req_id="req_img_001", is_gen=False, resolution=512, image_url="http://example.com/image.jpg"
)
result = await client.encode_image(request)
assert result["feature_url"] == "bos://host:port/key"
assert result["feature_shape"] == [80, 45, 1563]
@pytest.mark.asyncio
@respx.mock
async def test_encode_video_failure():
base_url = "http://testserver"
client = AsyncTokenizerClient(base_url=base_url, max_wait=1)
respx.post(f"{base_url}/video/encode").mock(
return_value=httpx.Response(200, json={"code": 0, "task_tag": "task_vid_001"})
)
# Simulate the polling endpoint returning an error state
respx.get(f"{base_url}/encode/get").mock(
return_value=httpx.Response(200, json={"state": "Error", "message": "Encode failed"})
)
request = VideoEncodeRequest(
version="v1",
req_id="req_vid_001",
is_gen=True,
resolution=720,
video_url="http://example.com/video.mp4",
start_ts=0.0,
end_ts=10.0,
frames=30,
)
with pytest.raises(RuntimeError, match="Encode failed"):
await client.encode_video(request)
@pytest.mark.asyncio
@respx.mock
async def test_encode_timeout():
base_url = "http://testserver"
client = AsyncTokenizerClient(base_url=base_url, max_wait=1, poll_interval=0.1)
respx.post(f"{base_url}/image/encode").mock(
return_value=httpx.Response(200, json={"code": 0, "task_tag": "task_timeout"})
)
# Simulate the polling endpoint never finishing, causing a timeout
respx.get(f"{base_url}/encode/get").mock(return_value=httpx.Response(200, json={"status": "processing"}))
request = ImageEncodeRequest(
version="v1", req_id="req_img_timeout", is_gen=False, resolution=256, image_url="http://example.com/image.jpg"
)
with pytest.raises(TimeoutError):
await client.encode_image(request)

View File

@@ -1,3 +1,19 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
import unittest
from unittest.mock import MagicMock, patch

View File

@@ -1,3 +1,19 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
import unittest
from unittest.mock import MagicMock, patch

View File

@@ -0,0 +1,134 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
import unittest
from unittest.mock import AsyncMock, MagicMock
from fastdeploy.entrypoints.openai.response_processors import ChatResponseProcessor
class TestChatResponseProcessor(unittest.IsolatedAsyncioTestCase):
def setUp(self):
self.mock_data_processor = MagicMock()
self.mock_data_processor.process_response_dict = MagicMock(
side_effect=lambda response_dict, **_: {"processed": True, "raw": response_dict}
)
async def asyncSetUp(self):
self.processor_mm = ChatResponseProcessor(
data_processor=self.mock_data_processor,
enable_mm_output=True,
eoi_token_id=101032,
eos_token_id=2,
decoder_base_url="http://fake-decoder",
)
self.processor_mm.decoder_client.decode_image = AsyncMock(
return_value={"http_url": "http://image.url/test.png"}
)
async def test_text_only_mode(self):
"""不开启 multimodal 时,直接走 data_processor"""
processor = ChatResponseProcessor(self.mock_data_processor)
request_outputs = [{"outputs": {"text": "hello"}}]
results = [
r
async for r in processor.process_response_chat(
request_outputs, stream=False, enable_thinking=False, include_stop_str_in_output=False
)
]
self.mock_data_processor.process_response_dict.assert_called_once()
self.assertEqual(results[0]["processed"], True)
self.assertEqual(results[0]["raw"]["outputs"]["text"], "hello")
async def test_streaming_text_and_image(self):
"""流式模式下text → image → text"""
request_outputs = [
{"request_id": "req1", "outputs": {"decode_type": 0, "token_ids": [1], "text": "hi"}},
{"request_id": "req1", "outputs": {"decode_type": 1, "token_ids": [[11, 22]]}},
{"request_id": "req1", "outputs": {"decode_type": 0, "token_ids": [101032], "text": "done"}},
]
results = [
r
async for r in self.processor_mm.process_response_chat(
request_outputs, stream=True, enable_thinking=False, include_stop_str_in_output=False
)
]
# First yield is text
text_part = results[0]["outputs"]["multipart"][0]
self.assertEqual(text_part["type"], "text")
self.assertEqual(text_part["text"], "hi")
# Second yield is the image (its token_ids were accumulated and concatenated)
image_part = results[1]["outputs"]["multipart"][0]
self.assertEqual(image_part["type"], "image")
self.assertEqual(image_part["url"], "http://image.url/test.png")
self.assertEqual(results[1]["outputs"]["token_ids"], [[11, 22]])
# Third yield is text
text_part = results[2]["outputs"]["multipart"][0]
self.assertEqual(text_part["type"], "text")
self.assertEqual(text_part["text"], "done")
async def test_streaming_buffer_accumulation(self):
"""流式模式decode_type=1 只累积 buffer不 yield"""
request_outputs = [{"request_id": "req2", "outputs": {"decode_type": 1, "token_ids": [[33, 44]]}}]
results = [
r
async for r in self.processor_mm.process_response_chat(
request_outputs, stream=True, enable_thinking=False, include_stop_str_in_output=False
)
]
self.assertEqual(results, [])
self.assertEqual(self.processor_mm._mm_buffer, [[33, 44]])
async def test_non_streaming_accumulate_and_emit(self):
"""非流式模式:等 eos_token_id 才输出 multiparttext+image"""
request_outputs = [
{"request_id": "req3", "outputs": {"decode_type": 0, "token_ids": [10], "text": "hello"}},
{"request_id": "req3", "outputs": {"decode_type": 1, "token_ids": [[55, 66]]}},
{"request_id": "req3", "outputs": {"decode_type": 0, "token_ids": [2], "text": "bye"}}, # eos_token_id
]
results = [
r
async for r in self.processor_mm.process_response_chat(
request_outputs, stream=False, enable_thinking=False, include_stop_str_in_output=False
)
]
# Only the final output yields a result
self.assertEqual(len(results), 1)
multipart = results[0]["outputs"]["multipart"]
self.assertEqual(multipart[0]["type"], "text")
self.assertEqual(multipart[0]["text"], "hello")
self.assertEqual(multipart[1]["type"], "image")
self.assertEqual(multipart[1]["url"], "http://image.url/test.png")
self.assertEqual(multipart[2]["type"], "text")
self.assertEqual(multipart[2]["text"], "bye")
if __name__ == "__main__":
unittest.main()

View File

@@ -1,3 +1,19 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
import unittest
from typing import List
from unittest.mock import Mock

View File

@@ -1,3 +1,19 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
import asyncio
import unittest