Sync v2.0 version of code to github repo

Jiang-Jia-Jun
2025-06-29 23:29:37 +00:00
parent d151496038
commit 92c2cfa2e7
597 changed files with 78776 additions and 22905 deletions


@@ -23,6 +23,7 @@ import numpy as np
from fastdeploy.input.preprocess import InputPreprocessor
from fastdeploy.engine.request import Request
from fastdeploy.inter_communicator import ZmqClient, IPCSignal
from fastdeploy.metrics.work_metrics import work_process_metrics
from fastdeploy.utils import api_server_logger, EngineError
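
The new import pulls in a process-wide metrics object that is used further down to record prompt-length and max_tokens statistics. Its definition is not part of this hunk; the following is only a rough sketch, assuming a prometheus_client-backed module, with the attribute names taken from how work_process_metrics is used below and everything else (metric names, help strings, container class) assumed.

# Hedged sketch only: fastdeploy.metrics.work_metrics is not shown in this diff.
# Attribute names mirror the usage below; metric names and structure are assumptions.
from prometheus_client import Counter, Histogram

class _WorkProcessMetrics:
    def __init__(self):
        # Cumulative number of prompt tokens seen by this worker process.
        self.prompt_tokens_total = Counter(
            "prompt_tokens_total", "Total prompt tokens processed")
        # Distribution of prompt lengths per request.
        self.request_prompt_tokens = Histogram(
            "request_prompt_tokens", "Prompt length in tokens per request")
        # Distribution of the effective max_tokens per request.
        self.request_params_max_tokens = Histogram(
            "request_params_max_tokens", "Effective max_tokens per request")

work_process_metrics = _WorkProcessMetrics()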
@@ -30,9 +31,16 @@ class EngineClient:
"""
EngineClient is a class that handles the communication between the client and the server.
"""
def __init__(self, tokenizer, max_model_len, tensor_parallel_size, pid, enable_mm=False):
input_processor = InputPreprocessor(tokenizer, enable_mm)
def __init__(self, tokenizer, max_model_len, tensor_parallel_size, pid, limit_mm_per_prompt, mm_processor_kwargs,
enable_mm=False, reasoning_parser=None):
input_processor = InputPreprocessor(tokenizer,
reasoning_parser,
limit_mm_per_prompt,
mm_processor_kwargs,
enable_mm)
self.enable_mm = enable_mm
self.reasoning_parser = reasoning_parser
self.data_processor = input_processor.create_processor()
self.max_model_len = max_model_len
self.worker_healthy_live_recorded_time_array = np.zeros(shape=[tensor_parallel_size], dtype=np.int32)
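
The constructor now threads the per-prompt multimodal limits, the multimodal processor kwargs, and an optional reasoning parser through to InputPreprocessor, and keeps reasoning_parser on the instance. A caller-side sketch of the widened signature follows; the commented import path and every argument value are placeholders, only the parameter names come from the hunk above.

# Illustrative caller-side sketch; the import path and all values are placeholders.
# from fastdeploy.entrypoints.engine_client import EngineClient  # module path assumed

client = EngineClient(
    tokenizer="path/to/tokenizer",        # placeholder tokenizer identifier
    max_model_len=8192,
    tensor_parallel_size=1,
    pid=12345,
    limit_mm_per_prompt={"image": 1},     # new: per-prompt multimodal item limits
    mm_processor_kwargs={},               # new: kwargs forwarded to the multimodal processor
    enable_mm=False,
    reasoning_parser=None,                # new: optional reasoning-output parser
)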
@@ -73,6 +81,7 @@ class EngineClient:
prompts["max_tokens"] = self.max_model_len - 1
self.add_requests(prompts)
return prompts["prompt_token_ids"]
def add_requests(self, task):
"""
@@ -85,7 +94,6 @@ class EngineClient:
Returns:
None
"""
self.vaild_parameters(task)
task["preprocess_start_time"] = time.time()
try:
@@ -94,7 +102,15 @@ class EngineClient:
task["prompt_token_ids_len"] = len(task["prompt_token_ids"])
input_ids_len = task["prompt_token_ids_len"]
task["max_tokens"] = min(self.max_model_len - input_ids_len , task.get("max_tokens"))
if task.get("reasoning_max_tokens", None) is None:
task["reasoning_max_tokens"] = max(int(task["max_tokens"] * 0.8), 1)
min_tokens = task.get("min_tokens", 1)
if 'messages' in task:
del task['messages']
api_server_logger.info(f"task['max_tokens']:{task['max_tokens']}")
work_process_metrics.request_params_max_tokens.observe(task["max_tokens"])
work_process_metrics.prompt_tokens_total.inc(input_ids_len)
work_process_metrics.request_prompt_tokens.observe(input_ids_len)
except Exception as e:
api_server_logger.error(e)
raise EngineError(str(e), error_code=400)
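
The added lines cap max_tokens to the remaining context window and, when the caller did not set reasoning_max_tokens, reserve roughly 80% of that budget (at least 1 token) for reasoning before the metrics are recorded. A standalone restatement of the arithmetic with made-up numbers:

# Standalone sketch of the token-budget defaults above; the numbers are illustrative.
max_model_len = 8192
input_ids_len = 1000                 # prompt length after preprocessing
requested_max_tokens = 32768         # value supplied by the caller

# Generation budget can never exceed what is left of the context window.
max_tokens = min(max_model_len - input_ids_len, requested_max_tokens)   # 7192

# Default reasoning budget: ~80% of max_tokens, floored at 1.
reasoning_max_tokens = max(int(max_tokens * 0.8), 1)                    # 5753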
@@ -102,7 +118,7 @@ class EngineClient:
if input_ids_len + min_tokens >= self.max_model_len:
error_msg = (
f"Input text is too long, input_ids_len ({input_ids_len}) "
f"+ min_dec_len ({min_tokens}) >= max_model_len "
f"+ min_tokens({min_tokens}) >= max_model_len({self.max_model_len})"
)
api_server_logger.error(error_msg)
raise EngineError(error_msg, error_code=400)
@@ -120,6 +136,8 @@ class EngineClient:
f"Cache request with request_id ({task.get('request_id')}), "
f"cost {time.time() - preprocess_cost_time}"
)
self.vaild_parameters(task)
api_server_logger.debug(f"Recieve task: {task}")
try:
if not self.enable_mm:
@@ -144,6 +162,10 @@ class EngineClient:
if data["max_tokens"] < 1 or data["max_tokens"] >= self.max_model_len:
raise ValueError(f"max_tokens can be defined [1, {self.max_model_len}).")
if data.get("reasoning_max_tokens"):
if data["reasoning_max_tokens"] > data["max_tokens"] or data["reasoning_max_tokens"] < 1:
raise ValueError("reasoning_max_tokens must be between max_tokens and 1")
if data.get("top_p"):
if data["top_p"] > 1 or data["top_p"] < 0:
raise ValueError(
@@ -246,4 +268,3 @@ class EngineClient:
return False, "clear model weight timeout"
time.sleep(1)
return True, ""
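
Read together, the validation added in this commit requires max_tokens in [1, max_model_len), reasoning_max_tokens between 1 and max_tokens when it is supplied, and top_p within [0, 1]. A condensed, standalone restatement of those checks follows; it is not the project's actual vaild_parameters body, only the rules visible in the hunks above.

# Condensed restatement of the checks shown in the diff; not the real method body.
def check_sampling_params(data: dict, max_model_len: int) -> None:
    if data["max_tokens"] < 1 or data["max_tokens"] >= max_model_len:
        raise ValueError(f"max_tokens must lie in [1, {max_model_len}).")
    if data.get("reasoning_max_tokens"):
        if data["reasoning_max_tokens"] > data["max_tokens"] or data["reasoning_max_tokens"] < 1:
            raise ValueError("reasoning_max_tokens must be between 1 and max_tokens.")
    if data.get("top_p"):
        if data["top_p"] > 1 or data["top_p"] < 0:
            raise ValueError("top_p must lie in [0, 1].")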