[Cherry-Pick][BugFix][CI] fix vl moe(#4867) (#4869)

* [CI] update paddlepaddle_gpu==3.2.1 and fix rollout_model test logic
* [Cherry-Pick][BugFix][CI] fix vl moe(#4867)
2025-12-24 13:28:13 +08:00 · 2025-11-07 00:03:36 +08:00
parent 89934edc10
commit 71bbedaf50
3 changed files with 10 additions and 6 deletions
--- a/fastdeploy/model_executor/models/ernie4_5_vl/ernie4_5_vl_moe.py
+++ b/fastdeploy/model_executor/models/ernie4_5_vl/ernie4_5_vl_moe.py
@@ -72,6 +72,7 @@ class VLMoEMeta:
    image_index: paddle.Tensor
    token_type_ids: paddle.Tensor
    image_token_num: paddle.Tensor
+    num_image_patch_id: paddle.Tensor

    def __str__(self):
        return (
@@ -499,11 +500,13 @@ class Ernie4_5_VLModel(nn.Layer):
        ids_remove_padding: paddle.Tensor,
    ) -> VLMoEMeta:

-        image_mask = ids_remove_padding == self.im_patch_id
+        image_mask = ids_remove_padding >= self.im_patch_id
        token_type_ids = image_mask.cast("int32")
        image_token_num = image_mask.sum()
        token_num = ids_remove_padding.shape[0]
        text_token_num = paddle.maximum((token_num - image_token_num), paddle.ones([], dtype="int64"))
+        num_image_patch_id = ids_remove_padding == self.im_patch_id
+        num_image_patch_id = num_image_patch_id.cast("int32").sum()

        # The scenario requiring padding is CUDA graph, thus we only need to pad the maximum capture size.
        self._cuda_graph_buffers["token_type_ids"][: self.fd_config.graph_opt_config.max_capture_size].fill_(-1)
@@ -517,6 +520,7 @@ class Ernie4_5_VLModel(nn.Layer):
            image_index=self._cuda_graph_buffers["image_index"][:token_num],
            token_type_ids=self._cuda_graph_buffers["token_type_ids"][:token_num],
            image_token_num=self._cuda_graph_buffers["image_token_num"],
+            num_image_patch_id=num_image_patch_id,
        )

    def get_input_embeddings(self, ids_remove_padding: paddle.Tensor) -> paddle.Tensor:
@@ -787,7 +791,7 @@ class Ernie4_5_VLMoeForConditionalGeneration(ModelForCasualLM):
        input_embeddings = self.get_input_embeddings(
            ids_remove_padding=ids_remove_padding,
            image_features=image_features,
-            image_token_num=vl_moe_meta.image_token_num.item(),
+            image_token_num=vl_moe_meta.num_image_patch_id.item(),
        )
        self._input_embeddings.copy_(input_embeddings, False)

--- a/tests/ci_use/EB_VL_Lite/test_EB_VL_Lite_serving.py
+++ b/tests/ci_use/EB_VL_Lite/test_EB_VL_Lite_serving.py
@@ -238,9 +238,9 @@ def test_consistency_between_runs(api_url, headers, consistent_payload):
    # base result
    base_path = os.getenv("MODEL_PATH")
    if base_path:
-        base_file = os.path.join(base_path, "ernie-4_5-vl-base-tp2")
+        base_file = os.path.join(base_path, "ernie-4_5-vl-base-tp2-v2.3")
    else:
-        base_file = "ernie-4_5-vl-base-tp2"
+        base_file = "ernie-4_5-vl-base-tp2-v2.3"
    with open(base_file, "r") as f:
        content2 = f.read()

--- a/tests/e2e/test_EB_VL_Lite_serving.py
+++ b/tests/e2e/test_EB_VL_Lite_serving.py
@@ -247,9 +247,9 @@ def test_consistency_between_runs(api_url, headers, consistent_payload):
    # base result
    base_path = os.getenv("MODEL_PATH")
    if base_path:
-        base_file = os.path.join(base_path, "ernie-4_5-vl-base-tp2")
+        base_file = os.path.join(base_path, "ernie-4_5-vl-base-tp2-v2.3")
    else:
-        base_file = "ernie-4_5-vl-base-tp2"
+        base_file = "ernie-4_5-vl-base-tp2-v2.3"
    with open(base_file, "r") as f:
        content2 = f.read()