[BugFix] fix video bug (#5557)

* fix video bug

* add eb5 moe model
Author: kevin
Date: 2025-12-16 20:06:50 +08:00
Committed by: GitHub
parent 27ef3610b5
commit 7140939c51
2 changed files with 5 additions and 3 deletions


@@ -245,7 +245,7 @@ class ModelConfig:
         """
         check if the model architecture disable for mm prefill
         """
-        return self._architecture in ["Ernie5ForCausalLM"]
+        return self._architecture in ["Ernie5ForCausalLM", "Ernie5MoeForCausalLM"]

     def _post_init(self):
         self.is_unified_ckpt = check_unified_ckpt(self.model)
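
Note: a minimal sketch of the behaviour this first change encodes. The class and method names below (`ModelConfigSketch`, `mm_prefill_disabled`) are illustrative stand-ins, not the repository's actual API. The check is a plain membership test against the parsed architecture string, so the EB5 MoE variant has to be listed explicitly alongside the dense `Ernie5ForCausalLM`.

```python
# Illustrative sketch only; names are hypothetical, the membership test mirrors the diff.
class ModelConfigSketch:
    def __init__(self, architecture: str):
        self._architecture = architecture

    def mm_prefill_disabled(self) -> bool:
        # mm prefill is disabled for the listed Ernie5 architectures
        return self._architecture in ["Ernie5ForCausalLM", "Ernie5MoeForCausalLM"]

assert ModelConfigSketch("Ernie5MoeForCausalLM").mm_prefill_disabled()
assert not ModelConfigSketch("Qwen2ForCausalLM").mm_prefill_disabled()
```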


@@ -531,7 +531,8 @@ class GPUModelRunner(ModelRunnerBase):
                 assert (
                     image_features_output is not None
                 ), f"image_features_output is None, images_lst length: {len(multi_vision_inputs['images_lst'])}"
-                mm_token_lenght = paddle.prod(multi_vision_inputs["grid_thw_lst"][thw_idx]) // 4
+                grid_thw = multi_vision_inputs["grid_thw_lst"][thw_idx]
+                mm_token_lenght = (grid_thw[1] * grid_thw[2]) // 4
                 mm_feature = image_features_output[feature_idx : feature_idx + mm_token_lenght]
                 # add feature to encoder cache
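
Note on the fix: the old line took `paddle.prod` over the full `grid_thw` entry, i.e. T * H * W patches, so for video inputs with more than one temporal patch the computed slice length presumably exceeded the number of features produced for that position; the new code keeps only the spatial dimensions. The standalone sketch below reproduces the arithmetic with `math.prod` in place of `paddle.prod`, assuming `grid_thw` is laid out as [T, H, W] and the `// 4` corresponds to a 2x2 spatial merge (an assumption based on the surrounding code, not stated in the diff).

```python
# Illustrative arithmetic only; math.prod stands in for paddle.prod.
# Assumes grid_thw is [T, H, W] and // 4 reflects a 2x2 spatial merge of patches.
from math import prod

def old_token_length(grid_thw):
    # previous behaviour: T * H * W // 4, which over-counts for multi-frame video
    return prod(grid_thw) // 4

def new_token_length(grid_thw):
    # fixed behaviour: only the spatial grid H * W contributes per feature position
    _t, h, w = grid_thw
    return (h * w) // 4

image = [1, 32, 32]   # single frame: both formulas agree (256 tokens)
video = [8, 32, 32]   # 8 temporal patches: old result is 8x too large
assert old_token_length(image) == new_token_length(image) == 256
assert old_token_length(video) == 2048 and new_token_length(video) == 256
```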
@@ -555,7 +556,8 @@ class GPUModelRunner(ModelRunnerBase):
             merge_image_features, feature_idx, thw_idx = [], 0, 0
             image_features_output = self.extract_vision_features(multi_vision_inputs)
             for feature_position in multi_vision_inputs["feature_position_list"]:
-                mm_token_lenght = paddle.prod(multi_vision_inputs["grid_thw_lst"][thw_idx]) // 4
+                grid_thw = multi_vision_inputs["grid_thw_lst"][thw_idx]
+                mm_token_lenght = (grid_thw[1] * grid_thw[2]) // 4
                 mm_feature = image_features_output[feature_idx : feature_idx + mm_token_lenght]
                 feature_start = feature_position.offset
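
The same length computation is duplicated in this second loop over `feature_position_list` (the path that assembles `merge_image_features`), so the identical two-line replacement is applied here as well; `feature_idx` presumably advances by `mm_token_lenght` per position, so both sites have to agree for the slices into `image_features_output` to stay aligned.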