[Feature] Support Paddle-OCR (#4396)

* init * update code * fix code style & disable thinking * adapt for common_engine.update_mm_requests_chunk_size * use 3d rope * use flash_attn_unpadded * opt siglip * update to be compatible with the latest codebase * fix typo * optim OCR performance * fix bug * fix bug * fix bug * fix bug * normlize name * modify xpu rope * revert logger * fix bug * fix bug * fix bug * support default_v1 * optim performance * fix bug --------- Co-authored-by: root <root@szzj-acg-tge1-fdda9.szzj.baidu.com> Co-authored-by: zhangyue66 <zhangyue66@baidu.com>
2025-12-24 13:28:13 +08:00 · 2025-10-24 23:34:30 +08:00
parent 822dea8d5f
commit e4e3cede7f
21 changed files with 2869 additions and 175 deletions
--- a/fastdeploy/model_executor/models/paddleocr_vl/projector.py
+++ b/fastdeploy/model_executor/models/paddleocr_vl/projector.py
@@ -0,0 +1,107 @@
+"""
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+
+import math
+
+import paddle
+import paddle.nn as nn
+
+from fastdeploy.model_executor.layers.utils import get_tensor
+
+
+class GELUActivation(nn.Layer):
+    """
+    GELU activation layer with a selectable implementation.
+
+    By default the layer delegates to ``nn.functional.gelu``. When ``use_gelu_python`` is true it instead
+    evaluates the erf-based formula ``x * 0.5 * (1 + erf(x / sqrt(2)))`` with Paddle tensor ops.
+    Also see the Gaussian Error Linear Units paper: https://arxiv.org/abs/1606.08415
+    """
+
+    def __init__(self, use_gelu_python: bool = False):
+        super().__init__()
+        # Pick the callable once here so forward() is a plain dispatch.
+        self.act = self._gelu_python if use_gelu_python else nn.functional.gelu
+
+    def _gelu_python(self, input):
+        """Evaluate GELU from its erf definition using Paddle ops."""
+        scaled = input * 0.5
+        erf_term = paddle.erf(input / math.sqrt(2.0))
+        return scaled * (1.0 + erf_term)
+
+    def forward(self, input):
+        """Apply the activation chosen at construction time to ``input``."""
+        return self.act(input)
+
+
+class Projector(nn.Layer):
+    """
+    Project vision features into the text model's hidden size.
+
+    Features are layer-normalised at the vision hidden size, groups of
+    ``merge_kernel_size[0] * merge_kernel_size[1]`` feature vectors are concatenated along the channel axis,
+    and the result goes through ``linear_1 -> GELU -> linear_2``.
+
+    Args:
+        text_config: Config object; only ``hidden_size`` is read here (output width of ``linear_2``).
+        vision_config: Config object; only ``hidden_size`` is read here (input width of ``pre_norm``).
+        prefix: Name prefix under which this layer's parameters are stored in a checkpoint state dict.
+    """
+
+    def __init__(self, text_config, vision_config, prefix=""):
+        super().__init__()
+        self.prefix_name = prefix
+        self.text_config = text_config
+        self.vision_config = vision_config
+        self.merge_kernel_size = (2, 2)
+
+        # Width after concatenating one merge window of vision features.
+        self.hidden_size = self.vision_config.hidden_size * self.merge_kernel_size[0] * self.merge_kernel_size[1]
+
+        self.pre_norm = nn.LayerNorm(self.vision_config.hidden_size, epsilon=1e-05)
+        self.linear_1 = nn.Linear(self.hidden_size, self.hidden_size)
+        self.act = GELUActivation()
+        self.linear_2 = nn.Linear(self.hidden_size, self.text_config.hidden_size)
+
+    def forward(self, image_features, image_grid_thw):
+        """
+        Run the projector on vision features.
+
+        Args:
+            image_features: Either a list/tuple with one 2-D feature tensor per image, laid out as
+                ``(t * h * w, d)``, or a single tensor whose last axis is the vision hidden size.
+            image_grid_thw: Per-image ``(t, h, w)`` grid sizes. Only read on the list/tuple path.
+
+        Returns:
+            A list with one projected tensor per image when ``image_features`` is a list/tuple,
+            otherwise a single 2-D tensor of projected features.
+        """
+        m1, m2 = self.merge_kernel_size
+        if isinstance(image_features, (list, tuple)):
+            # Lazy import: einops is only needed on this path. Done once per call, not once per image.
+            from einops import rearrange
+
+            processed_features = []
+            for image_feature, image_grid in zip(image_features, image_grid_thw):
+                image_feature = self.pre_norm(image_feature)  # shape: (T*H*W, D)
+                t, h, w = image_grid
+                # Gather every m1 x m2 spatial window into one row by concatenating its features.
+                image_feature = rearrange(
+                    image_feature,
+                    "(t h p1 w p2) d -> (t h w) (p1 p2 d)",
+                    t=int(t),
+                    h=int(h // m1),
+                    p1=int(m1),
+                    w=int(w // m2),
+                    p2=int(m2),
+                )
+                hidden_states = self.linear_1(image_feature)
+                hidden_states = self.act(hidden_states)
+                hidden_states = self.linear_2(hidden_states)
+                processed_features.append(hidden_states)
+
+            return processed_features
+
+        # Tensor path: flatten to rows, normalise, then fold each run of m1 * m2 consecutive rows into one.
+        # NOTE(review): presumably the caller has already ordered rows so that consecutive rows share a
+        # merge window -- confirm against the vision encoder output.
+        dim = image_features.shape[-1]
+        image_features = paddle.reshape(image_features, [-1, dim])
+        hidden_states = self.pre_norm(image_features)
+        hidden_states = paddle.reshape(hidden_states, [-1, self.hidden_size])
+        hidden_states = self.linear_1(hidden_states)
+        hidden_states = self.act(hidden_states)
+        hidden_states = self.linear_2(hidden_states)
+        return hidden_states
+
+    def load_state_dict(self, state_dict):
+        """
+        Copy this layer's parameters out of ``state_dict``.
+
+        Each parameter is looked up as ``"<prefix>.<param_name>"``, or as the bare parameter name when the
+        prefix is empty. Matched entries are popped from ``state_dict``.
+
+        Raises:
+            ValueError: If a key is missing or the stored tensor's shape differs from the parameter's.
+        """
+        params_dict = dict(self.named_parameters())
+        for param_name, param in params_dict.items():
+            # With an empty prefix, avoid building a key with a leading "." that could never match.
+            state_dict_key = f"{self.prefix_name}.{param_name}" if self.prefix_name else param_name
+            if state_dict_key not in state_dict:
+                raise ValueError(f"The key {state_dict_key} does not exist in state_dict. ")
+            tensor = get_tensor(state_dict.pop(state_dict_key))
+            if param.shape != tensor.shape:
+                raise ValueError(f"{state_dict_key} param.shape={param.shape} tensor.shape={tensor.shape}")
+            param.copy_(tensor, False)