fix xpu offline demo garbled output (#2763)
@@ -142,7 +142,8 @@ def xpu_process_output(
 def xpu_post_process(sampled_token_ids: paddle.Tensor,
-                     model_output: ModelOutputData) -> None:
+                     model_output: ModelOutputData,
+                     skip_save_output: bool) -> None:
     """

     """
@@ -185,6 +186,7 @@ def xpu_post_process(sampled_token_ids: paddle.Tensor,
     )
     # 3. Transmit the model's output and stop generation signal via message queue.
     # In the future, we will abandon this approach.
+    if not skip_save_output:
         save_output(
             sampled_token_ids,
             model_output.not_need_stop,
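
The core of the fix is visible in these first two hunks: xpu_post_process gains a skip_save_output flag, and save_output (which pushes tokens onto the inter-process message queue) now only runs when that flag is off. Previously, tokens sampled during warm-up passes were queued alongside real results, which is what garbled the offline demo's output. Below is a minimal, self-contained sketch of the gating pattern; the queue and function names are illustrative stand-ins, not FastDeploy's API.

from queue import Queue

# Hypothetical stand-in for the message queue that save_output() feeds.
output_queue: Queue = Queue()

def post_process(sampled_token_ids: list, skip_save_output: bool) -> None:
    # Bookkeeping (stop flags, sampler state, ...) would always happen here.
    if skip_save_output:
        return  # warm-up tokens are meaningless; never publish them
    output_queue.put(sampled_token_ids)

post_process([101, 102], skip_save_output=True)   # dummy run: nothing queued
post_process([7, 8, 9], skip_save_output=False)   # real step: result queued
assert output_queue.qsize() == 1
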
@@ -658,7 +660,7 @@ class XPUModelRunner(ModelRunnerBase):
         self._dummy_prefill_inputs(num_tokens, batch_size)

         while True:
-            self.execute_model(None)
+            self.execute_model(None, True)

             if int((self.share_inputs['seq_lens_this_time'] > 0).sum()) == 0:
                 break
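
This hunk is the warm-up driver: after building dummy prefill inputs it decodes in a loop until every slot drains, and it now passes True for the new is_dummy_run parameter so those iterations stay off the queue. Here is a runnable sketch of the loop's shape, with a stub standing in for XPUModelRunner; the one-token-per-step decrement is a simplification for illustration, not the real model step.

import numpy as np

class StubRunner:
    # Hypothetical stand-in for XPUModelRunner, only to show the loop's shape.
    def __init__(self) -> None:
        self.share_inputs = {'seq_lens_this_time': np.array([2, 1, 3])}

    def execute_model(self, batch=None, is_dummy_run: bool = False) -> None:
        # Pretend each step finishes one token per active slot.
        lens = self.share_inputs['seq_lens_this_time']
        self.share_inputs['seq_lens_this_time'] = np.maximum(lens - 1, 0)

runner = StubRunner()
while True:
    runner.execute_model(None, True)  # True -> is_dummy_run: outputs suppressed
    if int((runner.share_inputs['seq_lens_this_time'] > 0).sum()) == 0:
        break  # all slots drained; warm-up complete
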
@@ -666,6 +668,7 @@ class XPUModelRunner(ModelRunnerBase):
     def execute_model(
         self,
         model_forward_batch: Optional[List[Request]] = None,
+        is_dummy_run: bool = False,
     ) -> Optional[ModelRunnerOutput]:
         """
         The Entrance of model execute.
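
Because the new parameter defaults to False, every existing call site keeps its old behavior; only the warm-up loop above opts in. A small sketch of that backward compatibility (simplified signature, body elided):

from typing import List, Optional

def execute_model(model_forward_batch: Optional[List] = None,
                  is_dummy_run: bool = False) -> None:
    pass  # body elided

batch: List = []
execute_model(batch)       # existing callers: is_dummy_run stays False
execute_model(None, True)  # the warm-up loop passes True positionally
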
@@ -721,7 +724,8 @@ class XPUModelRunner(ModelRunnerBase):
             accept_num=None,
         )
         xpu_post_process(sampled_token_ids=sampled_token_ids,
-                         model_output=model_output_data)
+                         model_output=model_output_data,
+                         skip_save_output=is_dummy_run)

         # 7. Updata 'infer_seed' and step_paddle()
         self.share_inputs["infer_seed"].add_(self.infer_seed_increment)
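
This last runner hunk completes the plumbing: execute_model forwards its is_dummy_run flag to xpu_post_process as skip_save_output, so a single boolean decides whether a step's tokens ever reach the client. A condensed sketch of the whole chain, with simplified names and the sampler replaced by a constant:

def post_process(token_ids, skip_save_output: bool, sink: list) -> None:
    if not skip_save_output:
        sink.append(token_ids)  # stands in for save_output(...)

def execute_model(sink: list, batch=None, is_dummy_run: bool = False) -> None:
    token_ids = [0]  # stands in for the sampler's real output
    post_process(token_ids, skip_save_output=is_dummy_run, sink=sink)

saved: list = []
execute_model(saved, None, True)  # dummy run: nothing reaches the sink
execute_model(saved, batch=[1])   # real run: output saved
assert saved == [[0]]
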
@@ -86,7 +86,6 @@ class XpuWorker(WorkerBase):
         You may limit the usage of GPU memory
         by adjusting the `gpu_memory_utilization` parameter.
         """
-        # logger.warn("XPU current could not determine available memory")
         from fastdeploy.model_executor.ops.xpu import \
             xpu_get_free_global_memory, xpu_get_total_global_memory, xpu_get_used_global_memory

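
The worker-side change only deletes a stale comment: the commented-out logger.warn dates from before the XPU memory-introspection ops existed, and the import immediately below it is what superseded it. The diff does not show those ops' signatures, so the sketch below fakes them with stubs; the device-id argument, byte units, and the budgeting formula are assumptions for illustration only.

def xpu_get_total_global_memory_stub(device_id: int) -> int:
    return 16 * 1024**3  # pretend the card has 16 GiB

def xpu_get_free_global_memory_stub(device_id: int) -> int:
    return 12 * 1024**3  # pretend 12 GiB are currently free

def available_memory_bytes(device_id: int, gpu_memory_utilization: float) -> int:
    # Cap usage at a fraction of total memory, but never claim more than is free.
    total = xpu_get_total_global_memory_stub(device_id)
    free = xpu_get_free_global_memory_stub(device_id)
    return min(int(total * gpu_memory_utilization), free)

print(available_memory_bytes(0, 0.9) / 1024**3, "GiB budget")  # -> 12.0 GiB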