[BugFix] Fix zero-size image_features causing insert failure (#4042)

* update

* fix image_features handling: guard the masked insert on image_token_num instead of the (possibly zero-size) image_features tensor
This commit is contained in:
Ayakouji
2025-09-12 19:13:08 +08:00
committed by GitHub
parent 9ac539471d
commit 987609c894
2 changed files with 18 additions and 15 deletions

View File

@@ -114,14 +114,14 @@ def cuda_graph_buffers(buffer_meta):
cur = getattr(cur, p) cur = getattr(cur, p)
return cur return cur
if not hasattr(self, "_mm_buffers"): if not hasattr(self, "_cuda_graph_buffers"):
self._mm_buffers = {} self._cuda_graph_buffers = {}
for name, meta in buffer_meta.items(): for name, meta in buffer_meta.items():
shape = [_resolve_path(fd_config, s) if isinstance(s, str) else s for s in meta["shape"]] shape = [_resolve_path(fd_config, s) if isinstance(s, str) else s for s in meta["shape"]]
dtype = meta["dtype"] dtype = meta["dtype"]
if "." in meta["dtype"]: if "." in meta["dtype"]:
dtype = _resolve_path(fd_config, meta["dtype"]) dtype = _resolve_path(fd_config, meta["dtype"])
self._mm_buffers[name] = paddle.full( self._cuda_graph_buffers[name] = paddle.full(
shape=shape, shape=shape,
dtype=dtype, dtype=dtype,
fill_value=meta.get("value", 0), fill_value=meta.get("value", 0),

View File

@@ -506,17 +506,17 @@ class Ernie4_5_VLModel(nn.Layer):
text_token_num = paddle.maximum((token_num - image_token_num), paddle.ones([], dtype="int64")) text_token_num = paddle.maximum((token_num - image_token_num), paddle.ones([], dtype="int64"))
# The scenario requiring padding is CUDA graph, thus we only need to pad the maximum capture size. # The scenario requiring padding is CUDA graph, thus we only need to pad the maximum capture size.
self._mm_buffers["token_type_ids"][: self.fd_config.graph_opt_config.max_capture_size].fill_(-1) self._cuda_graph_buffers["token_type_ids"][: self.fd_config.graph_opt_config.max_capture_size].fill_(-1)
self._mm_buffers["token_type_ids"].copy_(token_type_ids, False) self._cuda_graph_buffers["token_type_ids"].copy_(token_type_ids, False)
self._mm_buffers["image_token_num"].copy_(image_token_num, False) self._cuda_graph_buffers["image_token_num"].copy_(image_token_num, False)
return VLMoEMeta( return VLMoEMeta(
text_input=self._mm_buffers["text_input"][:text_token_num], text_input=self._cuda_graph_buffers["text_input"][:text_token_num],
image_input=self._mm_buffers["image_input"][:image_token_num], image_input=self._cuda_graph_buffers["image_input"][:image_token_num],
text_index=self._mm_buffers["text_index"][:token_num], text_index=self._cuda_graph_buffers["text_index"][:token_num],
image_index=self._mm_buffers["image_index"][:token_num], image_index=self._cuda_graph_buffers["image_index"][:token_num],
token_type_ids=self._mm_buffers["token_type_ids"][:token_num], token_type_ids=self._cuda_graph_buffers["token_type_ids"][:token_num],
image_token_num=self._mm_buffers["image_token_num"], image_token_num=self._cuda_graph_buffers["image_token_num"],
) )
def get_input_embeddings(self, ids_remove_padding: paddle.Tensor) -> paddle.Tensor: def get_input_embeddings(self, ids_remove_padding: paddle.Tensor) -> paddle.Tensor:
@@ -756,10 +756,11 @@ class Ernie4_5_VLMoeForConditionalGeneration(ModelForCasualLM):
def get_input_embeddings( def get_input_embeddings(
self, self,
ids_remove_padding: paddle.Tensor, ids_remove_padding: paddle.Tensor,
image_token_num: int,
image_features: Optional[paddle.Tensor] = None, image_features: Optional[paddle.Tensor] = None,
) -> paddle.Tensor: ) -> paddle.Tensor:
input_embeddings = self.ernie.get_input_embeddings(ids_remove_padding=ids_remove_padding) input_embeddings = self.ernie.get_input_embeddings(ids_remove_padding=ids_remove_padding)
if image_features is not None and len(image_features) > 0: if image_token_num > 0:
input_embeddings[ids_remove_padding == self.ernie.im_patch_id] = image_features.cast(self.ernie._dtype) input_embeddings[ids_remove_padding == self.ernie.im_patch_id] = image_features.cast(self.ernie._dtype)
return input_embeddings return input_embeddings
@@ -769,11 +770,13 @@ class Ernie4_5_VLMoeForConditionalGeneration(ModelForCasualLM):
image_features: Optional[paddle.Tensor], image_features: Optional[paddle.Tensor],
forward_meta: ForwardMeta, forward_meta: ForwardMeta,
): ):
vl_moe_meta = self.ernie.prepare_vl_moe_meta(ids_remove_padding=ids_remove_padding)
input_embeddings = self.get_input_embeddings( input_embeddings = self.get_input_embeddings(
ids_remove_padding=ids_remove_padding, image_features=image_features ids_remove_padding=ids_remove_padding,
image_features=image_features,
image_token_num=vl_moe_meta.image_token_num.item(),
) )
self._input_embeddings.copy_(input_embeddings, False) self._input_embeddings.copy_(input_embeddings, False)
vl_moe_meta = self.ernie.prepare_vl_moe_meta(ids_remove_padding=ids_remove_padding)
hidden_states = self.ernie( hidden_states = self.ernie(
input_embeddings=self._input_embeddings, input_embeddings=self._input_embeddings,