Mirror of https://github.com/PaddlePaddle/FastDeploy.git, synced 2025-10-08 18:11:00 +08:00
[xpu] support ep (#4067)
Some checks failed
CE Compile Job / ce_job_pre_check (push) Has been cancelled
CE Compile Job / print_ce_job_pre_check_outputs (push) Has been cancelled
CE Compile Job / FD-Clone-Linux (push) Has been cancelled
CE Compile Job / Show Code Archive Output (push) Has been cancelled
CE Compile Job / BUILD_SM8090 (push) Has been cancelled
CE Compile Job / BUILD_SM8689 (push) Has been cancelled
CE Compile Job / CE_UPLOAD (push) Has been cancelled
Deploy GitHub Pages / deploy (push) Has been cancelled
@@ -872,16 +872,36 @@ class XPUModelRunner(ModelRunnerBase):
         self._dummy_prefill_inputs(num_tokens, batch_size)
         while True:
-            self.execute_model(None, True)
+            self.execute_model(is_dummy_run=True)
             if int((self.share_inputs["seq_lens_this_time"] > 0).sum()) == 0:
                 break
 
+    def _set_debug_level(
+        self, debug_level: int = 0x1, model_forward_batch: Optional[List[Request]] = None, is_dummy_run: bool = False
+    ) -> None:
+        """
+        Set debug level for XPU: 0x1, 0xA1, 0x1B1
+        """
+        request_num = 0 if model_forward_batch is None else len(model_forward_batch)
+        if debug_level == 0 or request_num == 0 or is_dummy_run:
+            paddle.device.xpu.set_debug_level(0)
+            return
+
+        if self.parallel_config.use_ep:
+            request_num = paddle.to_tensor(request_num, dtype="int32")
+            paddle.distributed.all_reduce(request_num, group=self.parallel_config.ep_group)
+            logger.info(f"local_rank: {self.local_rank}, request_num: {request_num.item()}")
+            if request_num.item() > 0:
+                paddle.device.xpu.set_debug_level(debug_level)
+        else:
+            paddle.device.xpu.set_debug_level(debug_level)
+
     def execute_model(
         self,
         model_forward_batch: Optional[List[Request]] = None,
-        is_dummy_run: bool = False,
         num_running_requests: int = None,
+        is_dummy_run: bool = False,
     ) -> Optional[ModelRunnerOutput]:
         """
         The Entrance of model execute.
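The interesting piece is the use_ep branch of the new _set_debug_level: all_reduce is a collective, so every rank in the expert-parallel (EP) group must enter it together, and summing the per-rank request counts lets idle ranks switch to the same debug level as the busy ones. Below is a minimal standalone sketch of that gating pattern; it is an illustration, not code from this commit. It assumes a Paddle distributed environment (e.g. launched with python -m paddle.distributed.launch), and ep_group_has_requests is a hypothetical helper name.

# Sketch: gate a per-device debug toggle on whether ANY rank in the
# expert-parallel group currently has requests, mirroring the use_ep
# branch of _set_debug_level above. Hypothetical illustration only.
import paddle
import paddle.distributed as dist

def ep_group_has_requests(local_request_num: int, group=None) -> bool:
    # all_reduce is a collective: every rank in the group must call it
    # together, or the group deadlocks. The default reduce op is SUM.
    count = paddle.to_tensor(local_request_num, dtype="int32")
    dist.all_reduce(count, group=group)
    return int(count.item()) > 0

if __name__ == "__main__":
    dist.init_parallel_env()
    # Pretend only rank 0 has live requests; all other ranks are idle.
    local = 2 if dist.get_rank() == 0 else 0
    # Every rank computes the same answer, so every rank can set the same
    # debug level (cf. paddle.device.xpu.set_debug_level in the diff).
    print(f"rank {dist.get_rank()}: ep group busy = {ep_group_has_requests(local)}")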
@@ -892,6 +912,9 @@ class XPUModelRunner(ModelRunnerBase):
         num_running_requests: batch_size
         intermediate_tensors:
         """
+        # 0. set debug level
+        # self._set_debug_level(0x1, model_forward_batch, is_dummy_run)
+
         # 1. Prepare inputs of model and decoder.
         self._prepare_inputs(is_dummy_run=is_dummy_run)
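On the dummy-run fix in the first hunk: the commit moves is_dummy_run behind num_running_requests in the execute_model signature, so the old positional call self.execute_model(None, True) would have silently bound True to num_running_requests; switching the call site to the keyword form keeps it correct. A toy sketch of that drain loop follows; MockRunner is a hypothetical, simplified stand-in for XPUModelRunner, not the real class.

# Sketch of the dummy-run drain loop from the first hunk. Each "model step"
# here just shortens the fake per-sequence workloads by one token.
import paddle

class MockRunner:
    def __init__(self):
        # Three fake sequences with 3, 1 and 2 steps of work remaining.
        self.share_inputs = {"seq_lens_this_time": paddle.to_tensor([3, 1, 2])}

    def execute_model(self, model_forward_batch=None, num_running_requests=None, is_dummy_run=False):
        # A real run would launch the model; here each call consumes one step.
        lens = self.share_inputs["seq_lens_this_time"]
        self.share_inputs["seq_lens_this_time"] = paddle.clip(lens - 1, min=0)

runner = MockRunner()
while True:
    # Keyword call: unlike the old positional execute_model(None, True),
    # this stays correct regardless of parameter order.
    runner.execute_model(is_dummy_run=True)
    if int((runner.share_inputs["seq_lens_this_time"] > 0).sum()) == 0:
        break
print("drained:", runner.share_inputs["seq_lens_this_time"].numpy())

The loop terminates exactly when no sequence has tokens scheduled for the current step, which is the same test the runner itself uses.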