Mirror of https://github.com/PaddlePaddle/FastDeploy.git (synced 2025-12-24 13:28:13 +08:00)
@@ -245,7 +245,7 @@ class ModelConfig:
         """
         check if the model architecture disable for mm prefill
         """
-        return self._architecture in ["Ernie5ForCausalLM"]
+        return self._architecture in ["Ernie5ForCausalLM", "Ernie5MoeForCausalLM"]

     def _post_init(self):
         self.is_unified_ckpt = check_unified_ckpt(self.model)
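The broadened list means the MoE variant of ERNIE 5 is now also treated as an architecture that skips multimodal (mm) prefill. A minimal standalone sketch of the check, where only the architecture list is taken from the diff and the helper name is a hypothetical stand-in for the ModelConfig method:

# Minimal sketch; only the architecture list comes from the diff above,
# the function name is a hypothetical stand-in for the ModelConfig method.
def mm_prefill_disabled(architecture: str) -> bool:
    """True when the given architecture should skip multimodal (mm) prefill."""
    return architecture in ["Ernie5ForCausalLM", "Ernie5MoeForCausalLM"]

assert mm_prefill_disabled("Ernie5MoeForCausalLM")  # newly covered by this change
assert mm_prefill_disabled("Ernie5ForCausalLM")     # already covered before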
@@ -531,7 +531,8 @@ class GPUModelRunner(ModelRunnerBase):
             assert (
                 image_features_output is not None
             ), f"image_features_output is None, images_lst length: {len(multi_vision_inputs['images_lst'])}"
-            mm_token_lenght = paddle.prod(multi_vision_inputs["grid_thw_lst"][thw_idx]) // 4
+            grid_thw = multi_vision_inputs["grid_thw_lst"][thw_idx]
+            mm_token_lenght = (grid_thw[1] * grid_thw[2]) // 4
             mm_feature = image_features_output[feature_idx : feature_idx + mm_token_lenght]

             # add feature to encoder cache
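The replaced line changes how many vision tokens are attributed to each multimodal item: the old expression multiplied all three grid dimensions (T x H x W), while the new one uses only H x W, i.e. a single frame, with the // 4 presumably accounting for 2x2 spatial patch merging. A small worked example with an illustrative grid_thw (the real values come from multi_vision_inputs["grid_thw_lst"]):

import paddle

# Illustrative [frames, H patches, W patches]; not taken from the diff.
grid_thw = paddle.to_tensor([2, 32, 32])

old_len = paddle.prod(grid_thw).item() // 4        # 2 * 32 * 32 // 4 = 512
new_len = (grid_thw[1] * grid_thw[2]).item() // 4  # 32 * 32 // 4 = 256

print(old_len, new_len)  # 512 256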
@@ -555,7 +556,8 @@ class GPUModelRunner(ModelRunnerBase):
         merge_image_features, feature_idx, thw_idx = [], 0, 0
         image_features_output = self.extract_vision_features(multi_vision_inputs)
         for feature_position in multi_vision_inputs["feature_position_list"]:
-            mm_token_lenght = paddle.prod(multi_vision_inputs["grid_thw_lst"][thw_idx]) // 4
+            grid_thw = multi_vision_inputs["grid_thw_lst"][thw_idx]
+            mm_token_lenght = (grid_thw[1] * grid_thw[2]) // 4
             mm_feature = image_features_output[feature_idx : feature_idx + mm_token_lenght]

             feature_start = feature_position.offset
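For context, a runnable sketch of the slicing pattern around this hunk: each item's token length determines how large a slice of image_features_output it consumes, and feature_idx walks forward by that amount. The loop-advance lines and the plain-list stand-ins are assumptions, since the diff does not show them; the other names follow the diff.

# Sketch only: plain lists stand in for paddle tensors, and the
# feature_idx advancing is assumed (not shown in the diff).
def slice_mm_features(image_features_output, grid_thw_lst):
    merge_image_features, feature_idx = [], 0
    for grid_thw in grid_thw_lst:
        # Per-item token length: one frame's H x W patches, merged 2x2
        # (identifier spelling follows the diff).
        mm_token_lenght = (grid_thw[1] * grid_thw[2]) // 4
        mm_feature = image_features_output[feature_idx : feature_idx + mm_token_lenght]
        merge_image_features.append(mm_feature)
        feature_idx += mm_token_lenght
    return merge_image_features

# Two toy items with grid_thw = [t, h, w], yielding 4 and 9 tokens each.
features = list(range(13))
parts = slice_mm_features(features, [[1, 4, 4], [1, 6, 6]])
assert [len(p) for p in parts] == [4, 9]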