fix xpu offline demo garbled output (#2763)

Author: yulangz
Date: 2025-07-09 14:51:20 +08:00
Committed by: GitHub
Parent: fee544e808
Commit: 0350831c2b

2 changed files with 13 additions and 10 deletions
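Reading of the patch (context for the diffs below): during warm-up the runner executes the model on dummy batches, and xpu_post_process previously called save_output unconditionally, so tokens sampled from dummy inputs were pushed onto the output message queue and surfaced in the offline demo as garbled text. The fix threads an is_dummy_run / skip_save_output flag from execute_model down to the queue write. A minimal runnable sketch of the pattern, using queue.Queue as a stand-in for FastDeploy's message queue (all names here are illustrative except skip_save_output / is_dummy_run):

# Minimal sketch of the failure mode and the fix. queue.Queue stands in
# for FastDeploy's inter-process message queue.
from queue import Queue

output_queue: Queue = Queue()

def post_process(sampled_token_ids, skip_save_output: bool = False) -> None:
    # Patched behavior: dummy-run outputs never reach the queue.
    if not skip_save_output:
        output_queue.put(sampled_token_ids)

def execute_model(tokens, is_dummy_run: bool = False) -> None:
    # The runner forwards its dummy-run flag down to post-processing.
    post_process(tokens, skip_save_output=is_dummy_run)

execute_model([101, 102], is_dummy_run=True)  # warm-up: queue stays empty
execute_model([7, 8, 9])                      # real request: tokens enqueued
print(list(output_queue.queue))               # [[7, 8, 9]]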

File 1 of 2:

@@ -142,7 +142,8 @@ def xpu_process_output(
 def xpu_post_process(sampled_token_ids: paddle.Tensor,
-                     model_output: ModelOutputData) -> None:
+                     model_output: ModelOutputData,
+                     skip_save_output: bool) -> None:
     """
     """
@@ -185,12 +186,13 @@ def xpu_post_process(sampled_token_ids: paddle.Tensor,
     )
     # 3. Transmit the model's output and stop generation signal via message queue.
     # In the future, we will abandon this approach.
-    save_output(
-        sampled_token_ids,
-        model_output.not_need_stop,
-        model_output.mp_rank,
-        False,  # use_ep
-    )
+    if not skip_save_output:
+        save_output(
+            sampled_token_ids,
+            model_output.not_need_stop,
+            model_output.mp_rank,
+            False,  # use_ep
+        )


 def step_paddle(share_inputs: Dict[str, paddle.Tensor], block_size: int,
@@ -658,7 +660,7 @@ class XPUModelRunner(ModelRunnerBase):
         self._dummy_prefill_inputs(num_tokens, batch_size)
         while True:
-            self.execute_model(None)
+            self.execute_model(None, True)
             if int((self.share_inputs['seq_lens_this_time'] > 0).sum()) == 0:
                 break
@@ -666,6 +668,7 @@ class XPUModelRunner(ModelRunnerBase):
     def execute_model(
         self,
         model_forward_batch: Optional[List[Request]] = None,
+        is_dummy_run: bool = False,
     ) -> Optional[ModelRunnerOutput]:
         """
         The Entrance of model execute.
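Because the new flag defaults to False, existing callers of execute_model keep transmitting outputs unchanged; only the warm-up loop passes True. A hypothetical stand-in showing that call-site contract (RunnerSketch is not FastDeploy code):

# Hypothetical stand-in demonstrating the contract of the new flag.
from typing import List, Optional

class RunnerSketch:
    def execute_model(self,
                      model_forward_batch: Optional[List] = None,
                      is_dummy_run: bool = False) -> str:
        return "output skipped" if is_dummy_run else "output transmitted"

runner = RunnerSketch()
print(runner.execute_model())            # serving path: "output transmitted"
print(runner.execute_model(None, True))  # warm-up path: "output skipped"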
@@ -721,7 +724,8 @@ class XPUModelRunner(ModelRunnerBase):
             accept_num=None,
         )
         xpu_post_process(sampled_token_ids=sampled_token_ids,
-                         model_output=model_output_data)
+                         model_output=model_output_data,
+                         skip_save_output=is_dummy_run)
         # 7. Updata 'infer_seed' and step_paddle()
         self.share_inputs["infer_seed"].add_(self.infer_seed_increment)

File 2 of 2:

@@ -86,7 +86,6 @@ class XpuWorker(WorkerBase):
         You may limit the usage of GPU memory
         by adjusting the `gpu_memory_utilization` parameter.
         """
-        # logger.warn("XPU current could not determine available memory")
         from fastdeploy.model_executor.ops.xpu import \
             xpu_get_free_global_memory, xpu_get_total_global_memory, xpu_get_used_global_memory
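These ops report XPU memory statistics, but their exact signatures are not shown in this diff. As a loosely hedged illustration of how total/used readings might combine with the gpu_memory_utilization cap mentioned in the docstring above, using a plain stand-in function rather than the real ops:

# Illustrative only: derives a memory budget from total/used readings while
# honoring `gpu_memory_utilization`. Not the actual XpuWorker computation.
def usable_memory_bytes(total_bytes: int, used_bytes: int,
                        gpu_memory_utilization: float) -> int:
    budget = int(total_bytes * gpu_memory_utilization)
    return max(budget - used_bytes, 0)

# e.g. a 16 GiB card with 4 GiB already in use and a 0.9 utilization cap:
print(usable_memory_bytes(16 << 30, 4 << 30, 0.9))  # 11166914969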