Mirror of https://github.com/PaddlePaddle/FastDeploy.git (synced 2025-10-04 16:22:57 +08:00)
[fix] qwen output inconsistency when top_p=0 (#3634)
* [fix] qwen output inconsistency when top_p=0
* [fix] remove decode pre_id code
This commit is contained in:
@@ -288,6 +288,7 @@ class GPUModelRunner(ModelRunnerBase):
|
||||
self.share_inputs["step_idx"][idx : idx + 1] = (
|
||||
len(request.output_token_ids) if prefill_end_index >= len(input_ids) else 0
|
||||
)
|
||||
self.share_inputs["pre_ids"][idx : idx + 1] = -1
|
||||
has_prefill_task = True
|
||||
elif request.task_type.value == RequestType.DECODE.value: # decode task
|
||||
logger.debug(f"Handle decode request {request} at idx {idx}")
|
||||
|
@@ -276,6 +276,7 @@ class MetaxModelRunner(ModelRunnerBase):
|
||||
self.share_inputs["step_idx"][idx : idx + 1] = (
|
||||
len(request.output_token_ids) if prefill_end_index >= len(input_ids) else 0
|
||||
)
|
||||
self.share_inputs["pre_ids"][idx : idx + 1] = -1
|
||||
has_prefill_task = True
|
||||
elif request.task_type.value == RequestType.DECODE.value: # decode task
|
||||
logger.debug(f"Handle decode request {request} at idx {idx}")
|
||||
|
@@ -411,6 +411,7 @@ class XPUModelRunner(ModelRunnerBase):
|
||||
self.share_inputs["step_idx"][idx : idx + 1] = (
|
||||
len(request.output_token_ids) if prefill_end_index >= len(input_ids) else 0
|
||||
)
|
||||
self.share_inputs["pre_ids"][idx : idx + 1] = -1
|
||||
has_prefill_task = True
|
||||
elif request.task_type.value == RequestType.DECODE.value: # decode task
|
||||
logger.debug(f"Handle decode request {request} at idx {idx}")
|
||||
|
Reference in New Issue
Block a user