diff --git a/fastdeploy/model_executor/graph_optimization/decorator.py b/fastdeploy/model_executor/graph_optimization/decorator.py index 2937579b0..562164aae 100644 --- a/fastdeploy/model_executor/graph_optimization/decorator.py +++ b/fastdeploy/model_executor/graph_optimization/decorator.py @@ -114,14 +114,14 @@ def cuda_graph_buffers(buffer_meta): cur = getattr(cur, p) return cur - if not hasattr(self, "_mm_buffers"): - self._mm_buffers = {} + if not hasattr(self, "_cuda_graph_buffers"): + self._cuda_graph_buffers = {} for name, meta in buffer_meta.items(): shape = [_resolve_path(fd_config, s) if isinstance(s, str) else s for s in meta["shape"]] dtype = meta["dtype"] if "." in meta["dtype"]: dtype = _resolve_path(fd_config, meta["dtype"]) - self._mm_buffers[name] = paddle.full( + self._cuda_graph_buffers[name] = paddle.full( shape=shape, dtype=dtype, fill_value=meta.get("value", 0), diff --git a/fastdeploy/model_executor/models/ernie4_5_vl/ernie4_5_vl_moe.py b/fastdeploy/model_executor/models/ernie4_5_vl/ernie4_5_vl_moe.py index 875d97cdd..a36f9d4db 100644 --- a/fastdeploy/model_executor/models/ernie4_5_vl/ernie4_5_vl_moe.py +++ b/fastdeploy/model_executor/models/ernie4_5_vl/ernie4_5_vl_moe.py @@ -506,17 +506,17 @@ class Ernie4_5_VLModel(nn.Layer): text_token_num = paddle.maximum((token_num - image_token_num), paddle.ones([], dtype="int64")) # The scenario requiring padding is CUDA graph, thus we only need to pad the maximum capture size. - self._mm_buffers["token_type_ids"][: self.fd_config.graph_opt_config.max_capture_size].fill_(-1) - self._mm_buffers["token_type_ids"].copy_(token_type_ids, False) - self._mm_buffers["image_token_num"].copy_(image_token_num, False) + self._cuda_graph_buffers["token_type_ids"][: self.fd_config.graph_opt_config.max_capture_size].fill_(-1) + self._cuda_graph_buffers["token_type_ids"].copy_(token_type_ids, False) + self._cuda_graph_buffers["image_token_num"].copy_(image_token_num, False) return VLMoEMeta( - text_input=self._mm_buffers["text_input"][:text_token_num], - image_input=self._mm_buffers["image_input"][:image_token_num], - text_index=self._mm_buffers["text_index"][:token_num], - image_index=self._mm_buffers["image_index"][:token_num], - token_type_ids=self._mm_buffers["token_type_ids"][:token_num], - image_token_num=self._mm_buffers["image_token_num"], + text_input=self._cuda_graph_buffers["text_input"][:text_token_num], + image_input=self._cuda_graph_buffers["image_input"][:image_token_num], + text_index=self._cuda_graph_buffers["text_index"][:token_num], + image_index=self._cuda_graph_buffers["image_index"][:token_num], + token_type_ids=self._cuda_graph_buffers["token_type_ids"][:token_num], + image_token_num=self._cuda_graph_buffers["image_token_num"], ) def get_input_embeddings(self, ids_remove_padding: paddle.Tensor) -> paddle.Tensor: @@ -756,10 +756,11 @@ class Ernie4_5_VLMoeForConditionalGeneration(ModelForCasualLM): def get_input_embeddings( self, ids_remove_padding: paddle.Tensor, + image_token_num: int, image_features: Optional[paddle.Tensor] = None, ) -> paddle.Tensor: input_embeddings = self.ernie.get_input_embeddings(ids_remove_padding=ids_remove_padding) - if image_features is not None and len(image_features) > 0: + if image_token_num > 0: input_embeddings[ids_remove_padding == self.ernie.im_patch_id] = image_features.cast(self.ernie._dtype) return input_embeddings @@ -769,11 +770,13 @@ class Ernie4_5_VLMoeForConditionalGeneration(ModelForCasualLM): image_features: Optional[paddle.Tensor], forward_meta: ForwardMeta, ): + vl_moe_meta = self.ernie.prepare_vl_moe_meta(ids_remove_padding=ids_remove_padding) input_embeddings = self.get_input_embeddings( - ids_remove_padding=ids_remove_padding, image_features=image_features + ids_remove_padding=ids_remove_padding, + image_features=image_features, + image_token_num=vl_moe_meta.image_token_num.item(), ) self._input_embeddings.copy_(input_embeddings, False) - vl_moe_meta = self.ernie.prepare_vl_moe_meta(ids_remove_padding=ids_remove_padding) hidden_states = self.ernie( input_embeddings=self._input_embeddings,