[BugFix] mm_post_fix (#4005)

* mm_post_fix

* mm_post_fix_1
This commit is contained in:
xiaoxiaohehe001
2025-09-11 19:09:46 +08:00
committed by GitHub
parent d2ec7f6aa2
commit abdcef30aa
2 changed files with 20 additions and 34 deletions

View File

@@ -49,7 +49,6 @@ from fastdeploy.platforms import current_platform
 if current_platform.is_cuda():
     from fastdeploy.model_executor.ops.gpu import (
-        extract_text_token_output,
         text_image_gather_scatter,
         text_image_index_out,
     )
@@ -544,17 +543,6 @@ class Ernie4_5_VLModel(nn.Layer):
         )
         hidden_states = hidden_states + residual
-        max_seq_len, max_seq_len_index = paddle.topk(forward_meta.seq_lens_this_time, k=1)
-        hidden_states = extract_text_token_output(
-            max_seq_len,
-            max_seq_len_index.cast("int32"),
-            vl_moe_meta.image_token_num.cast("int32"),
-            forward_meta.seq_lens_this_time,
-            forward_meta.cu_seqlens_q,
-            hidden_states.cast("float32"),
-        ).cast(self._dtype)
         out = self.norm(hidden_states)
         return out

View File

@@ -1298,24 +1298,23 @@ class GPUModelRunner(ModelRunnerBase):
                 self.share_inputs["image_features"],
                 self.forward_meta,
             )
-            hidden_states = model_output
         else:
             model_output = self.model(
                 ids_remove_padding=self.share_inputs["ids_remove_padding"],
                 forward_meta=self.forward_meta,
             )
         hidden_states = rebuild_padding(
             model_output,
             self.share_inputs["cu_seqlens_q"],
             self.share_inputs["seq_lens_this_time"],
             self.share_inputs["seq_lens_decoder"],
             self.share_inputs["seq_lens_encoder"],
             (
                 self.share_inputs["output_padding_offset"] if self.speculative_decoding else None
             ),  # speculative decoding requires
             self.parallel_config.max_model_len,
         )
         # 4. Execute spec decode
         logits = self.model.compute_logits(hidden_states)
@@ -1608,21 +1607,20 @@ class GPUModelRunner(ModelRunnerBase):
                 self.share_inputs["image_features"],
                 self.forward_meta,
             )
-            hidden_states = model_output
         else:
             model_output = self.model(
                 ids_remove_padding=self.share_inputs["ids_remove_padding"],
                 forward_meta=self.forward_meta,
             )
         hidden_states = rebuild_padding(
             model_output,
             self.share_inputs["cu_seqlens_q"],
             self.share_inputs["seq_lens_this_time"],
             self.share_inputs["seq_lens_decoder"],
             self.share_inputs["seq_lens_encoder"],
             (self.share_inputs["output_padding_offset"] if self.speculative_decoding else None),
             self.parallel_config.max_model_len,
         )
         # 4. Compute logits, Sample
         logits = self.model.compute_logits(hidden_states)