diff --git a/fastdeploy/engine/engine.py b/fastdeploy/engine/engine.py
index 843480674..93e33c2bf 100644
--- a/fastdeploy/engine/engine.py
+++ b/fastdeploy/engine/engine.py
@@ -210,9 +210,6 @@ class LLMEngine:
                 request.get("max_tokens"),
             ),
         )
-        if request.get("reasoning_max_tokens") is None:
-            default_reasoning_max_tokens = max(int(request.get("max_tokens") * 0.8), 1)
-            request.set("reasoning_max_tokens", default_reasoning_max_tokens)
         min_tokens = request.get("min_tokens")
         if input_ids_len + min_tokens >= self.cfg.max_model_len:
             error_msg = (
diff --git a/fastdeploy/engine/sampling_params.py b/fastdeploy/engine/sampling_params.py
index 423434857..781580f7d 100644
--- a/fastdeploy/engine/sampling_params.py
+++ b/fastdeploy/engine/sampling_params.py
@@ -159,8 +159,6 @@ class SamplingParams:
     def __post_init__(self):
         if self.seed is None:
             self.seed = random.randint(0, 922337203685477580)
-        if self.max_tokens is not None and self.reasoning_max_tokens is None:
-            self.reasoning_max_tokens = max(int(self.max_tokens * 0.8), 1)
         self._verify_args()
 
     def _verify_args(self) -> None:
diff --git a/fastdeploy/input/ernie4_5_vl_processor/ernie4_5_vl_processor.py b/fastdeploy/input/ernie4_5_vl_processor/ernie4_5_vl_processor.py
index ce5187e3f..8e1d2e5e9 100644
--- a/fastdeploy/input/ernie4_5_vl_processor/ernie4_5_vl_processor.py
+++ b/fastdeploy/input/ernie4_5_vl_processor/ernie4_5_vl_processor.py
@@ -255,6 +255,10 @@ class Ernie4_5_VLProcessor(Ernie4_5Processor):
             request["prompt_token_ids"] = request["prompt_token_ids"][: max_model_len - 1]
         if request.get("max_tokens") is None:
             request["max_tokens"] = max(1, max_model_len - len(request["prompt_token_ids"]))
+        else:
+            request["max_tokens"] = min(max_model_len - len(request["prompt_token_ids"]), request["max_tokens"])
+        if request.get("reasoning_max_tokens") is None:
+            request["reasoning_max_tokens"] = max(int(request["max_tokens"] * 0.8), 1)
         data_processor_logger.info(f"Processed request {request}")
 
         return request
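
Taken together, these hunks move the reasoning_max_tokens default (80% of max_tokens, floored at 1) out of LLMEngine and SamplingParams and into the Ernie4_5 VL processor, which now also clamps a user-supplied max_tokens to the tokens remaining in the context window. The standalone sketch below illustrates the resulting rules on a plain request dict; the helper name apply_token_limits is hypothetical and not part of FastDeploy.

# Standalone sketch (assumption: helper name and call shape are illustrative,
# not FastDeploy APIs) of the token-limit rules the patched VL processor applies.
def apply_token_limits(request: dict, max_model_len: int) -> dict:
    remaining = max_model_len - len(request["prompt_token_ids"])
    if request.get("max_tokens") is None:
        # No user limit: allow everything left in the context window.
        request["max_tokens"] = max(1, remaining)
    else:
        # User limit: clamp it so prompt + generation stays within the window.
        request["max_tokens"] = min(remaining, request["max_tokens"])
    if request.get("reasoning_max_tokens") is None:
        # Default the reasoning budget to 80% of max_tokens, at least 1 token.
        request["reasoning_max_tokens"] = max(int(request["max_tokens"] * 0.8), 1)
    return request

# Example: a 100-token prompt with max_tokens=4096 against a 2048-token window
# is clamped to max_tokens=1948 and gets reasoning_max_tokens=1558.
print(apply_token_limits({"prompt_token_ids": list(range(100)), "max_tokens": 4096}, 2048))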