diff --git a/fastdeploy/model_executor/models/ernie4_5_vl/ernie4_5_vl_moe.py b/fastdeploy/model_executor/models/ernie4_5_vl/ernie4_5_vl_moe.py
index 702b05d50..875d97cdd 100644
--- a/fastdeploy/model_executor/models/ernie4_5_vl/ernie4_5_vl_moe.py
+++ b/fastdeploy/model_executor/models/ernie4_5_vl/ernie4_5_vl_moe.py
@@ -49,7 +49,6 @@ from fastdeploy.platforms import current_platform
 
 if current_platform.is_cuda():
     from fastdeploy.model_executor.ops.gpu import (
-        extract_text_token_output,
         text_image_gather_scatter,
         text_image_index_out,
     )
@@ -544,17 +543,6 @@ class Ernie4_5_VLModel(nn.Layer):
            )
 
        hidden_states = hidden_states + residual
-
-        max_seq_len, max_seq_len_index = paddle.topk(forward_meta.seq_lens_this_time, k=1)
-        hidden_states = extract_text_token_output(
-            max_seq_len,
-            max_seq_len_index.cast("int32"),
-            vl_moe_meta.image_token_num.cast("int32"),
-            forward_meta.seq_lens_this_time,
-            forward_meta.cu_seqlens_q,
-            hidden_states.cast("float32"),
-        ).cast(self._dtype)
-
        out = self.norm(hidden_states)
 
        return out
diff --git a/fastdeploy/worker/gpu_model_runner.py b/fastdeploy/worker/gpu_model_runner.py
index 9a7e2bdf5..690740175 100644
--- a/fastdeploy/worker/gpu_model_runner.py
+++ b/fastdeploy/worker/gpu_model_runner.py
@@ -1298,24 +1298,23 @@ class GPUModelRunner(ModelRunnerBase):
 
                 self.share_inputs["image_features"],
                 self.forward_meta,
             )
-            hidden_states = model_output
         else:
             model_output = self.model(
                 ids_remove_padding=self.share_inputs["ids_remove_padding"],
                 forward_meta=self.forward_meta,
             )
-            hidden_states = rebuild_padding(
-                model_output,
-                self.share_inputs["cu_seqlens_q"],
-                self.share_inputs["seq_lens_this_time"],
-                self.share_inputs["seq_lens_decoder"],
-                self.share_inputs["seq_lens_encoder"],
-                (
-                    self.share_inputs["output_padding_offset"] if self.speculative_decoding else None
-                ),  # speculative decoding requires
-                self.parallel_config.max_model_len,
-            )
+        hidden_states = rebuild_padding(
+            model_output,
+            self.share_inputs["cu_seqlens_q"],
+            self.share_inputs["seq_lens_this_time"],
+            self.share_inputs["seq_lens_decoder"],
+            self.share_inputs["seq_lens_encoder"],
+            (
+                self.share_inputs["output_padding_offset"] if self.speculative_decoding else None
+            ),  # speculative decoding requires
+            self.parallel_config.max_model_len,
+        )
 
         # 4. Execute spec decode
         logits = self.model.compute_logits(hidden_states)
@@ -1608,21 +1607,20 @@ class GPUModelRunner(ModelRunnerBase):
                 self.share_inputs["image_features"],
                 self.forward_meta,
             )
-            hidden_states = model_output
         else:
             model_output = self.model(
                 ids_remove_padding=self.share_inputs["ids_remove_padding"],
                 forward_meta=self.forward_meta,
             )
-            hidden_states = rebuild_padding(
-                model_output,
-                self.share_inputs["cu_seqlens_q"],
-                self.share_inputs["seq_lens_this_time"],
-                self.share_inputs["seq_lens_decoder"],
-                self.share_inputs["seq_lens_encoder"],
-                (self.share_inputs["output_padding_offset"] if self.speculative_decoding else None),
-                self.parallel_config.max_model_len,
-            )
+        hidden_states = rebuild_padding(
+            model_output,
+            self.share_inputs["cu_seqlens_q"],
+            self.share_inputs["seq_lens_this_time"],
+            self.share_inputs["seq_lens_decoder"],
+            self.share_inputs["seq_lens_encoder"],
+            (self.share_inputs["output_padding_offset"] if self.speculative_decoding else None),
+            self.parallel_config.max_model_len,
+        )
 
         # 4. Compute logits, Sample
         logits = self.model.compute_logits(hidden_states)