Mirror of https://github.com/PaddlePaddle/FastDeploy.git (synced 2025-12-24 13:28:13 +08:00)
[Others] clean code (#5235)
@@ -646,7 +646,6 @@ class GPUModelRunner(ModelRunnerBase):
             )
 
             self.share_inputs["first_token_ids"][idx : idx + 1] = self.share_inputs["input_ids"][idx : idx + 1, :1]
-            self.share_inputs["ori_seq_lens_encoder"][idx : idx + 1] = length
 
             if request.get("seed") is not None:
                 self.share_inputs["infer_seed"][idx : idx + 1] = request.get("seed")
@@ -864,7 +863,6 @@ class GPUModelRunner(ModelRunnerBase):
             self.share_inputs["stop_flags"][idx : idx + 1] = False
 
             self.share_inputs["first_token_ids"][idx : idx + 1] = self.share_inputs["input_ids"][idx : idx + 1, :1]
-            self.share_inputs["ori_seq_lens_encoder"][idx : idx + 1] = length
 
             if request.get("seed") is not None:
                 self.share_inputs["infer_seed"][idx : idx + 1] = request.get("seed")
@@ -1017,7 +1015,6 @@ class GPUModelRunner(ModelRunnerBase):
             self.share_inputs["stop_flags"][idx : idx + 1] = False
             self.share_inputs["temperature"][idx : idx + 1] = 1
             self.share_inputs["first_token_ids"][idx : idx + 1] = self.share_inputs["input_ids"][idx : idx + 1, :1]
-            self.share_inputs["ori_seq_lens_encoder"][idx : idx + 1] = input_length
 
             self.share_inputs["encoder_block_lens"][idx : idx + 1] = block_num
             self.share_inputs["block_tables"][idx : idx + 1, :block_num] = np.arange(
@@ -1101,7 +1098,6 @@ class GPUModelRunner(ModelRunnerBase):
         self.share_inputs["used_list_len"] = paddle.full([max_num_seqs], 0, dtype="int32")
         self.share_inputs["infer_seed"] = paddle.full([max_num_seqs, 1], 0, dtype="int64").cpu()
         self.share_inputs["first_token_ids"] = paddle.full([max_num_seqs, 1], -1, dtype="int64")
-        self.share_inputs["ori_seq_lens_encoder"] = paddle.full([max_num_seqs, 1], 0, dtype="int32")
         self.share_inputs["system_lens"] = paddle.full([max_num_seqs, 1], 0, dtype="int32")
         self.share_inputs["system_ids"] = paddle.full([max_num_seqs, 1], -1, dtype="int32")
 
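Note: the four hunks above are one cleanup applied at every touch point. ori_seq_lens_encoder was allocated at init and written on each request insert, but with no remaining readers the buffer is dead state, so the allocation and all the writes go together. A minimal sketch of the pattern, assuming paddle is installed (max_num_seqs as in the diff, other values invented):

import paddle

# A share_inputs-style registry: dropping a never-read entry removes
# both the allocation and the per-request writes that kept it fresh.
max_num_seqs = 8
share_inputs = {
    "infer_seed": paddle.full([max_num_seqs, 1], 0, dtype="int64"),
    "first_token_ids": paddle.full([max_num_seqs, 1], -1, dtype="int64"),
    # "ori_seq_lens_encoder" used to be allocated here as well.
}

idx = 0
share_inputs["infer_seed"][idx : idx + 1] = 42  # still read downstream
# share_inputs["ori_seq_lens_encoder"][idx : idx + 1] = length  # dead write, now removed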
@@ -1783,8 +1779,8 @@ class GPUModelRunner(ModelRunnerBase):
 
     def _dummy_run(
         self,
-        num_tokens: paddle.Tensor,
-        batch_size: paddle.Tensor,
+        num_tokens: int,
+        batch_size: int,
         expected_decode_len: int = 1,
         in_capturing: bool = False,
         capture_prefill: bool = False,
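Note: the annotation fix makes the signature match the call sites: _dummy_run receives plain Python ints that size the dummy batch, not paddle.Tensors. A minimal sketch under that assumption (the body and the call below are illustrative, not the real implementation):

def _dummy_run(
    num_tokens: int,
    batch_size: int,
    expected_decode_len: int = 1,
    in_capturing: bool = False,
    capture_prefill: bool = False,
) -> None:
    # Ints drive shape bookkeeping; a tensor annotation here misleads
    # both readers and static type checkers.
    assert num_tokens >= batch_size >= 1

_dummy_run(num_tokens=2048, batch_size=8)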
@@ -1830,21 +1826,20 @@ class GPUModelRunner(ModelRunnerBase):
         # 3. Run model
         if self.enable_mm:
             model_output = self.model(
-                self.share_inputs["ids_remove_padding"],
+                self.forward_meta.ids_remove_padding,
                 self.share_inputs["image_features"],
                 self.forward_meta,
             )
         else:
             model_output = self.model(
-                ids_remove_padding=self.share_inputs["ids_remove_padding"],
-                forward_meta=self.forward_meta,
+                self.forward_meta.ids_remove_padding,
+                self.forward_meta,
             )
         if self.use_cudagraph:
             model_output = model_output[: self.real_token_num]
 
         if self.is_pooling_model:
-            hidden_states = model_output
-            self._dummy_pooler_run(hidden_states)
+            self._dummy_pooler_run(model_output)
             break
         else:
             hidden_states = rebuild_padding(
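Note on unchanged context in this hunk: under CUDA graph replay the batch is padded to a fixed capture size, so the output is cut back to self.real_token_num before the pooler or sampler sees it. A toy illustration of that slice, assuming paddle (shapes invented, real_token_num named as in the diff):

import paddle

real_token_num = 5                             # valid tokens this step
padded_output = paddle.zeros([8, 16])          # graph-captured batch, padded to 8 rows
model_output = padded_output[:real_token_num]  # drop the padding rows
assert model_output.shape[0] == real_token_num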
@@ -2154,14 +2149,14 @@ class GPUModelRunner(ModelRunnerBase):
         # 3. Execute model
         if self.enable_mm:
             model_output = self.model(
-                self.share_inputs["ids_remove_padding"],
+                self.forward_meta.ids_remove_padding,
                 self.share_inputs["image_features"],
                 self.forward_meta,
             )
         else:
             model_output = self.model(
-                ids_remove_padding=self.share_inputs["ids_remove_padding"],
-                forward_meta=self.forward_meta,
+                self.forward_meta.ids_remove_padding,
+                self.forward_meta,
             )
         if self.use_cudagraph:
             model_output = model_output[: self.real_token_num]
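Note: both call sites now take ids_remove_padding from forward_meta instead of share_inputs, and the text-only branch switches from keyword to positional arguments so the two branches invoke the model identically. A sketch of the single-source-of-truth idea, with a ForwardMeta-style container (the field name is from the diff; everything else is illustrative):

from dataclasses import dataclass

@dataclass
class ForwardMeta:
    # The one place the flattened token ids live for this step.
    ids_remove_padding: list  # stand-in for a paddle.Tensor

def model(ids_remove_padding, forward_meta):
    return len(ids_remove_padding)

meta = ForwardMeta(ids_remove_padding=[1, 2, 3])
# Before, the model read share_inputs["ids_remove_padding"] while the rest
# of the step read forward_meta, leaving two copies to drift; now every
# reader goes through forward_meta.
out = model(meta.ids_remove_padding, meta)
assert out == 3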
@@ -2169,8 +2164,7 @@ class GPUModelRunner(ModelRunnerBase):
             prompt_logprobs_list = self._get_prompt_logprobs_list(model_output)
 
         if self.is_pooling_model:
-            hidden_states = model_output
-            pooler_output = self._pool(hidden_states, num_running_requests)
+            pooler_output = self._pool(model_output, num_running_requests)
 
         model_output_data = ModelOutputData(
             next_tokens=self.share_inputs["next_tokens"],
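Note: this hunk and the pooling branch of _dummy_run above are the same de-aliasing: hidden_states was a bare rename of model_output, so the alias is deleted and the tensor is passed straight through. A before/after sketch (the _pool call shape is from the diff; the body here is invented):

def _pool(hidden_states, num_running_requests):
    # Illustrative body: keep one pooled row per running request.
    return hidden_states[:num_running_requests]

model_output = [[0.1], [0.2], [0.3]]

# Before:
#     hidden_states = model_output
#     pooler_output = self._pool(hidden_states, num_running_requests)
# After: pass the value directly, one fewer name to track.
pooler_output = _pool(model_output, num_running_requests=2)
assert pooler_output == [[0.1], [0.2]]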