mirror of
https://github.com/PaddlePaddle/FastDeploy.git
synced 2025-10-06 00:57:33 +08:00
@@ -49,7 +49,6 @@ from fastdeploy.platforms import current_platform
|
|||||||
|
|
||||||
if current_platform.is_cuda():
|
if current_platform.is_cuda():
|
||||||
from fastdeploy.model_executor.ops.gpu import (
|
from fastdeploy.model_executor.ops.gpu import (
|
||||||
extract_text_token_output,
|
|
||||||
text_image_gather_scatter,
|
text_image_gather_scatter,
|
||||||
text_image_index_out,
|
text_image_index_out,
|
||||||
)
|
)
|
||||||
@@ -544,17 +543,6 @@ class Ernie4_5_VLModel(nn.Layer):
|
|||||||
)
|
)
|
||||||
|
|
||||||
hidden_states = hidden_states + residual
|
hidden_states = hidden_states + residual
|
||||||
|
|
||||||
max_seq_len, max_seq_len_index = paddle.topk(forward_meta.seq_lens_this_time, k=1)
|
|
||||||
hidden_states = extract_text_token_output(
|
|
||||||
max_seq_len,
|
|
||||||
max_seq_len_index.cast("int32"),
|
|
||||||
vl_moe_meta.image_token_num.cast("int32"),
|
|
||||||
forward_meta.seq_lens_this_time,
|
|
||||||
forward_meta.cu_seqlens_q,
|
|
||||||
hidden_states.cast("float32"),
|
|
||||||
).cast(self._dtype)
|
|
||||||
|
|
||||||
out = self.norm(hidden_states)
|
out = self.norm(hidden_states)
|
||||||
|
|
||||||
return out
|
return out
|
||||||
|
@@ -1298,24 +1298,23 @@ class GPUModelRunner(ModelRunnerBase):
|
|||||||
self.share_inputs["image_features"],
|
self.share_inputs["image_features"],
|
||||||
self.forward_meta,
|
self.forward_meta,
|
||||||
)
|
)
|
||||||
hidden_states = model_output
|
|
||||||
else:
|
else:
|
||||||
model_output = self.model(
|
model_output = self.model(
|
||||||
ids_remove_padding=self.share_inputs["ids_remove_padding"],
|
ids_remove_padding=self.share_inputs["ids_remove_padding"],
|
||||||
forward_meta=self.forward_meta,
|
forward_meta=self.forward_meta,
|
||||||
)
|
)
|
||||||
|
|
||||||
hidden_states = rebuild_padding(
|
hidden_states = rebuild_padding(
|
||||||
model_output,
|
model_output,
|
||||||
self.share_inputs["cu_seqlens_q"],
|
self.share_inputs["cu_seqlens_q"],
|
||||||
self.share_inputs["seq_lens_this_time"],
|
self.share_inputs["seq_lens_this_time"],
|
||||||
self.share_inputs["seq_lens_decoder"],
|
self.share_inputs["seq_lens_decoder"],
|
||||||
self.share_inputs["seq_lens_encoder"],
|
self.share_inputs["seq_lens_encoder"],
|
||||||
(
|
(
|
||||||
self.share_inputs["output_padding_offset"] if self.speculative_decoding else None
|
self.share_inputs["output_padding_offset"] if self.speculative_decoding else None
|
||||||
), # speculative decoding requires
|
), # speculative decoding requires
|
||||||
self.parallel_config.max_model_len,
|
self.parallel_config.max_model_len,
|
||||||
)
|
)
|
||||||
|
|
||||||
# 4. Execute spec decode
|
# 4. Execute spec decode
|
||||||
logits = self.model.compute_logits(hidden_states)
|
logits = self.model.compute_logits(hidden_states)
|
||||||
@@ -1608,21 +1607,20 @@ class GPUModelRunner(ModelRunnerBase):
|
|||||||
self.share_inputs["image_features"],
|
self.share_inputs["image_features"],
|
||||||
self.forward_meta,
|
self.forward_meta,
|
||||||
)
|
)
|
||||||
hidden_states = model_output
|
|
||||||
else:
|
else:
|
||||||
model_output = self.model(
|
model_output = self.model(
|
||||||
ids_remove_padding=self.share_inputs["ids_remove_padding"],
|
ids_remove_padding=self.share_inputs["ids_remove_padding"],
|
||||||
forward_meta=self.forward_meta,
|
forward_meta=self.forward_meta,
|
||||||
)
|
)
|
||||||
hidden_states = rebuild_padding(
|
hidden_states = rebuild_padding(
|
||||||
model_output,
|
model_output,
|
||||||
self.share_inputs["cu_seqlens_q"],
|
self.share_inputs["cu_seqlens_q"],
|
||||||
self.share_inputs["seq_lens_this_time"],
|
self.share_inputs["seq_lens_this_time"],
|
||||||
self.share_inputs["seq_lens_decoder"],
|
self.share_inputs["seq_lens_decoder"],
|
||||||
self.share_inputs["seq_lens_encoder"],
|
self.share_inputs["seq_lens_encoder"],
|
||||||
(self.share_inputs["output_padding_offset"] if self.speculative_decoding else None),
|
(self.share_inputs["output_padding_offset"] if self.speculative_decoding else None),
|
||||||
self.parallel_config.max_model_len,
|
self.parallel_config.max_model_len,
|
||||||
)
|
)
|
||||||
|
|
||||||
# 4. Compute logits, Sample
|
# 4. Compute logits, Sample
|
||||||
logits = self.model.compute_logits(hidden_states)
|
logits = self.model.compute_logits(hidden_states)
|
||||||
|
Reference in New Issue
Block a user