diff --git a/fastdeploy/engine/engine.py b/fastdeploy/engine/engine.py
index 7196bdc0b..41a82524b 100644
--- a/fastdeploy/engine/engine.py
+++ b/fastdeploy/engine/engine.py
@@ -29,6 +29,7 @@ import traceback
 import uuid
 import weakref
 from concurrent.futures import ThreadPoolExecutor
+from dataclasses import asdict
 from typing import Dict, List, Optional, Tuple
 
 import numpy as np
@@ -479,6 +480,8 @@ class LLMEngine:
         """
         # TODO 输入输出长度确认
 
+        if sampling_params is not None:
+            task.update(asdict(sampling_params))
         request = Request.from_dict(task)
         llm_logger.info(f"Receive request {request}")
         if sampling_params is not None:
diff --git a/fastdeploy/worker/gpu_model_runner.py b/fastdeploy/worker/gpu_model_runner.py
index 05f3e83dd..3220d9b0f 100644
--- a/fastdeploy/worker/gpu_model_runner.py
+++ b/fastdeploy/worker/gpu_model_runner.py
@@ -260,7 +260,11 @@ class GPUModelRunner(ModelRunnerBase):
                     self.share_inputs["need_think_end"][idx : idx + 1, :] = 0
                     self.share_inputs["reasoning_index"][idx : idx + 1, :] = 0
 
-                input_ids = request.prompt_token_ids + request.output_token_ids
+                if len(request.output_token_ids) == 0:
+                    input_ids = request.prompt_token_ids
+                else:
+                    input_ids = request.prompt_token_ids + request.output_token_ids
+
                 logger.debug(
                     f"Handle prefill request {request} at idx {idx} prefill_start_index {prefill_start_index} prefill_end_index {prefill_end_index} need_prefilled_token_num {len(input_ids)}"
                 )
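
For reviewers, a minimal standalone sketch of the two behaviors this patch adds: merging a SamplingParams dataclass into the task dict via dataclasses.asdict before building the Request, and skipping the prompt/output concatenation when no output tokens exist yet. The SamplingParams fields and helper names below are illustrative assumptions, not FastDeploy's actual definitions.

# Hedged sketch (not the FastDeploy implementation); SamplingParams fields are assumed.
from dataclasses import dataclass, asdict
from typing import List, Optional


@dataclass
class SamplingParams:
    # Hypothetical fields for illustration only.
    temperature: float = 1.0
    top_p: float = 0.95


def merge_sampling_params(task: dict, sampling_params: Optional[SamplingParams]) -> dict:
    # Mirrors the engine-side change: flatten the dataclass into the task dict
    # so a downstream Request.from_dict-style constructor sees the fields directly.
    if sampling_params is not None:
        task.update(asdict(sampling_params))
    return task


def build_input_ids(prompt_token_ids: List[int], output_token_ids: List[int]) -> List[int]:
    # Mirrors the model-runner change: concatenate only when generated output exists,
    # otherwise return the prompt ids unchanged (e.g. on the first prefill).
    if len(output_token_ids) == 0:
        return prompt_token_ids
    return prompt_token_ids + output_token_ids


if __name__ == "__main__":
    task = merge_sampling_params({"prompt_token_ids": [1, 2, 3]}, SamplingParams(temperature=0.7))
    print(task)                                 # {'prompt_token_ids': [1, 2, 3], 'temperature': 0.7, 'top_p': 0.95}
    print(build_input_ids([1, 2, 3], []))       # [1, 2, 3]
    print(build_input_ids([1, 2, 3], [4, 5]))   # [1, 2, 3, 4, 5]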