[fix] qwen output inconsistency when top_p=0 (#3634) (#3662)

* [fix] qwen output inconsistency when top_p=0

* [fix] remove decode pre_ids code
This commit is contained in:
李泳桦
2025-08-28 09:54:17 +08:00
committed by GitHub
parent 6a90cfd144
commit 6545994c58
2 changed files with 2 additions and 0 deletions

View File

@@ -283,6 +283,7 @@ class GPUModelRunner(ModelRunnerBase):
self.share_inputs["step_idx"][idx : idx + 1] = (
len(request.output_token_ids) if prefill_end_index >= len(input_ids) else 0
)
self.share_inputs["pre_ids"][idx : idx + 1] = -1
has_prefill_task = True
elif request.task_type.value == RequestType.DECODE.value: # decode task
logger.debug(f"Handle decode request {request} at idx {idx}")