From 71bbedaf50f26aaf3b43756ea1d930ceec8c2bb6 Mon Sep 17 00:00:00 2001 From: YuBaoku <49938469+EmmonsCurse@users.noreply.github.com> Date: Fri, 7 Nov 2025 00:03:36 +0800 Subject: [PATCH] [Cherry-Pick][BugFix][CI] fix vl moe(#4867) (#4869) * [CI] update paddlepaddle_gpu==3.2.1 and fix rollout_model test logic * [Cherry-Pick][BugFix][CI] fix vl moe(#4867) --- .../model_executor/models/ernie4_5_vl/ernie4_5_vl_moe.py | 8 ++++++-- tests/ci_use/EB_VL_Lite/test_EB_VL_Lite_serving.py | 4 ++-- tests/e2e/test_EB_VL_Lite_serving.py | 4 ++-- 3 files changed, 10 insertions(+), 6 deletions(-) diff --git a/fastdeploy/model_executor/models/ernie4_5_vl/ernie4_5_vl_moe.py b/fastdeploy/model_executor/models/ernie4_5_vl/ernie4_5_vl_moe.py index d6b11f0cd..5b8e4e13d 100644 --- a/fastdeploy/model_executor/models/ernie4_5_vl/ernie4_5_vl_moe.py +++ b/fastdeploy/model_executor/models/ernie4_5_vl/ernie4_5_vl_moe.py @@ -72,6 +72,7 @@ class VLMoEMeta: image_index: paddle.Tensor token_type_ids: paddle.Tensor image_token_num: paddle.Tensor + num_image_patch_id: paddle.Tensor def __str__(self): return ( @@ -499,11 +500,13 @@ class Ernie4_5_VLModel(nn.Layer): ids_remove_padding: paddle.Tensor, ) -> VLMoEMeta: - image_mask = ids_remove_padding == self.im_patch_id + image_mask = ids_remove_padding >= self.im_patch_id token_type_ids = image_mask.cast("int32") image_token_num = image_mask.sum() token_num = ids_remove_padding.shape[0] text_token_num = paddle.maximum((token_num - image_token_num), paddle.ones([], dtype="int64")) + num_image_patch_id = ids_remove_padding == self.im_patch_id + num_image_patch_id = num_image_patch_id.cast("int32").sum() # The scenario requiring padding is CUDA graph, thus we only need to pad the maximum capture size. self._cuda_graph_buffers["token_type_ids"][: self.fd_config.graph_opt_config.max_capture_size].fill_(-1) @@ -517,6 +520,7 @@ class Ernie4_5_VLModel(nn.Layer): image_index=self._cuda_graph_buffers["image_index"][:token_num], token_type_ids=self._cuda_graph_buffers["token_type_ids"][:token_num], image_token_num=self._cuda_graph_buffers["image_token_num"], + num_image_patch_id=num_image_patch_id, ) def get_input_embeddings(self, ids_remove_padding: paddle.Tensor) -> paddle.Tensor: @@ -787,7 +791,7 @@ class Ernie4_5_VLMoeForConditionalGeneration(ModelForCasualLM): input_embeddings = self.get_input_embeddings( ids_remove_padding=ids_remove_padding, image_features=image_features, - image_token_num=vl_moe_meta.image_token_num.item(), + image_token_num=vl_moe_meta.num_image_patch_id.item(), ) self._input_embeddings.copy_(input_embeddings, False) diff --git a/tests/ci_use/EB_VL_Lite/test_EB_VL_Lite_serving.py b/tests/ci_use/EB_VL_Lite/test_EB_VL_Lite_serving.py index 88cbd6d5a..6583674af 100644 --- a/tests/ci_use/EB_VL_Lite/test_EB_VL_Lite_serving.py +++ b/tests/ci_use/EB_VL_Lite/test_EB_VL_Lite_serving.py @@ -238,9 +238,9 @@ def test_consistency_between_runs(api_url, headers, consistent_payload): # base result base_path = os.getenv("MODEL_PATH") if base_path: - base_file = os.path.join(base_path, "ernie-4_5-vl-base-tp2") + base_file = os.path.join(base_path, "ernie-4_5-vl-base-tp2-v2.3") else: - base_file = "ernie-4_5-vl-base-tp2" + base_file = "ernie-4_5-vl-base-tp2-v2.3" with open(base_file, "r") as f: content2 = f.read() diff --git a/tests/e2e/test_EB_VL_Lite_serving.py b/tests/e2e/test_EB_VL_Lite_serving.py index d802fcdd2..62137d154 100644 --- a/tests/e2e/test_EB_VL_Lite_serving.py +++ b/tests/e2e/test_EB_VL_Lite_serving.py @@ -247,9 +247,9 @@ def test_consistency_between_runs(api_url, headers, consistent_payload): # base result base_path = os.getenv("MODEL_PATH") if base_path: - base_file = os.path.join(base_path, "ernie-4_5-vl-base-tp2") + base_file = os.path.join(base_path, "ernie-4_5-vl-base-tp2-v2.3") else: - base_file = "ernie-4_5-vl-base-tp2" + base_file = "ernie-4_5-vl-base-tp2-v2.3" with open(base_file, "r") as f: content2 = f.read()