From b2afdf4fc6d3dfa2ebf746eaa0e9b85a130dda4c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=9D=8E=E6=B3=B3=E6=A1=A6?= <39643373+liyonghua0910@users.noreply.github.com> Date: Wed, 27 Aug 2025 17:16:23 +0800 Subject: [PATCH] [fix] qwen output inconsistency when top_p=0 (#3634) * [fix] qwen output inconsistency when top_p=0 * [fix] remove decode pre_id code --- fastdeploy/worker/gpu_model_runner.py | 1 + fastdeploy/worker/metax_model_runner.py | 1 + fastdeploy/worker/xpu_model_runner.py | 1 + 3 files changed, 3 insertions(+) diff --git a/fastdeploy/worker/gpu_model_runner.py b/fastdeploy/worker/gpu_model_runner.py index cb4b8809c..291388aa6 100644 --- a/fastdeploy/worker/gpu_model_runner.py +++ b/fastdeploy/worker/gpu_model_runner.py @@ -288,6 +288,7 @@ class GPUModelRunner(ModelRunnerBase): self.share_inputs["step_idx"][idx : idx + 1] = ( len(request.output_token_ids) if prefill_end_index >= len(input_ids) else 0 ) + self.share_inputs["pre_ids"][idx : idx + 1] = -1 has_prefill_task = True elif request.task_type.value == RequestType.DECODE.value: # decode task logger.debug(f"Handle decode request {request} at idx {idx}") diff --git a/fastdeploy/worker/metax_model_runner.py b/fastdeploy/worker/metax_model_runner.py index 61f76876f..d9b28a739 100644 --- a/fastdeploy/worker/metax_model_runner.py +++ b/fastdeploy/worker/metax_model_runner.py @@ -276,6 +276,7 @@ class MetaxModelRunner(ModelRunnerBase): self.share_inputs["step_idx"][idx : idx + 1] = ( len(request.output_token_ids) if prefill_end_index >= len(input_ids) else 0 ) + self.share_inputs["pre_ids"][idx : idx + 1] = -1 has_prefill_task = True elif request.task_type.value == RequestType.DECODE.value: # decode task logger.debug(f"Handle decode request {request} at idx {idx}") diff --git a/fastdeploy/worker/xpu_model_runner.py b/fastdeploy/worker/xpu_model_runner.py index 55ac4beb5..e933c00f2 100644 --- a/fastdeploy/worker/xpu_model_runner.py +++ b/fastdeploy/worker/xpu_model_runner.py @@ -411,6 +411,7 @@ class XPUModelRunner(ModelRunnerBase): self.share_inputs["step_idx"][idx : idx + 1] = ( len(request.output_token_ids) if prefill_end_index >= len(input_ids) else 0 ) + self.share_inputs["pre_ids"][idx : idx + 1] = -1 has_prefill_task = True elif request.task_type.value == RequestType.DECODE.value: # decode task logger.debug(f"Handle decode request {request} at idx {idx}")