[Sync] Update to latest code (#2679)

* [Sync] Update to latest code

* Add new code files

* Add new code files

* update code

* Try to fix build.sh

* Try to fix build.sh

* Update code

* Update requirements.txt

* Update code

---------

Co-authored-by: Jiang-Jia-Jun <jiangjiajun@baidu.com>
Author: Jiang-Jia-Jun
Date: 2025-07-03 15:43:53 +08:00
Committed by: GitHub
Parent: d222248d00
Commit: 05c670e593
95 files changed, 9916 insertions(+), 1312 deletions(-)


@@ -14,13 +14,18 @@
# limitations under the License.
"""
-from typing import Optional
+import math
+from typing import Optional, Tuple

import paddle
import paddle.nn as nn

from fastdeploy.config import ModelConfig
from fastdeploy.platforms import current_platform

if current_platform.is_cuda():
    from fastdeploy.model_executor.ops.gpu import fused_rotary_position_encoding

from .utils import CpuGuard
@@ -99,20 +104,164 @@ class QwenRotaryEmbedding:
        return rot_emb


def yarn_get_mscale(scale=1, mscale=1):
    """
    Attention magnitude scaling factor used by YaRN for a given context-extension scale.
    """
    if scale <= 1:
        return 1.0
    return 0.1 * mscale * math.log(scale) + 1.0
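
For intuition, a quick numeric check of the formula above (a standalone sketch; the 40x extension ratio is just an illustrative value):

import math

def yarn_get_mscale(scale=1, mscale=1):
    # Same formula as above: no scaling at or below the original context,
    # otherwise slow logarithmic growth with the extension ratio.
    if scale <= 1:
        return 1.0
    return 0.1 * mscale * math.log(scale) + 1.0

print(yarn_get_mscale(scale=1.0))                  # 1.0  (no context extension)
print(yarn_get_mscale(scale=40.0))                 # ~1.369
print(yarn_get_mscale(scale=40.0, mscale=0.707))   # ~1.261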
def yarn_find_correction_dim(num_rotations,
                             dim,
                             base=10000,
                             max_position_embeddings=2048):
    """
    Inverse-dimension formula: the (fractional) rotary dimension index whose
    frequency completes `num_rotations` full rotations over the original context length.
    """
    return (dim * math.log(max_position_embeddings /
                           (num_rotations * 2 * math.pi))) / (2 *
                                                              math.log(base))


def yarn_find_correction_range(low_rot,
                               high_rot,
                               dim,
                               base=10000,
                               max_position_embeddings=2048):
    """
    Range of rotary dimension indices to correct, derived from the fast/slow
    rotation cutoffs.
    """
    low = math.floor(
        yarn_find_correction_dim(low_rot, dim, base, max_position_embeddings))
    high = math.ceil(
        yarn_find_correction_dim(high_rot, dim, base, max_position_embeddings))
    return max(low, 0), min(high, dim - 1)  # Clamp values just in case


def yarn_linear_ramp_mask(min, max, dim):
    """
    Linear ramp from 0 to 1 over [min, max], clipped to [0, 1] outside that range.
    """
    if min == max:
        max += 0.001  # Prevent singularity
    linear_func = (paddle.arange(dim, dtype=paddle.float32) - min) / (max -
                                                                      min)
    ramp_func = paddle.clip(linear_func, 0, 1)
    return ramp_func
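
To make the correction range concrete, a small sketch with illustrative numbers, assuming the three helpers above are in scope (for example, pasted into the same module):

import paddle

rotary_dim, base, max_pos = 64, 10000, 4096   # typical head dim, original context
beta_fast, beta_slow = 32, 1                  # defaults used by the class below

low, high = yarn_find_correction_range(beta_fast, beta_slow, rotary_dim,
                                       base, max_pos)
print(low, high)   # roughly (10, 23) for these numbers

# 1 - ramp is 1 for the fast-rotating dims below `low` (keep the original
# frequencies) and 0 for the slow dims above `high` (use interpolated ones).
mask = 1 - yarn_linear_ramp_mask(low, high, rotary_dim // 2)
print(mask)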
class DeepseekScalingRotaryEmbedding(nn.Layer):
    """RotaryEmbedding extended with YaRN method.

    Credits to Peng et al. github.com/jquesnelle/yarn

    Args:
        rotary_dim(int): Dimension of rotary embeddings (head dimension)
        max_position_embeddings(int): Original training context length
        base(float): Base value used to compute the inverse frequencies.
        scaling_factor(float): Context extension scaling ratio (target_len / original_len)
        extrapolation_factor(float): Weight for extrapolated frequencies (default=1)
        attn_factor(float): Attention magnitude scaling factor (default=1)
        beta_fast(int): High-frequency correction cutoff (default=32)
        beta_slow(int): Low-frequency correction cutoff (default=1)
        mscale(float): Primary magnitude scaling factor (default=1)
        mscale_all_dim(float): Alternate magnitude scaling factor (default=0)
    """

    def __init__(
        self,
        rotary_dim: int,
        max_position_embeddings: int,
        base: int,
        scaling_factor: float,
        *,
        extrapolation_factor: float = 1,
        attn_factor: float = 1,
        beta_fast: int = 32,
        beta_slow: int = 1,
        mscale: float = 1,
        mscale_all_dim: float = 0,
    ) -> None:
        super().__init__()
        self._dtype = paddle.get_default_dtype()
        self.rotary_dim = rotary_dim
        self.max_position_embeddings = max_position_embeddings
        self.base = base
        self.scaling_factor = scaling_factor
        self.extrapolation_factor = extrapolation_factor
        self.attn_factor = attn_factor
        self.beta_fast = beta_fast
        self.beta_slow = beta_slow
        # Get n-d magnitude scaling corrected for interpolation.
        self.mscale = float(
            yarn_get_mscale(self.scaling_factor, float(mscale)) /
            yarn_get_mscale(self.scaling_factor, float(mscale_all_dim)) *
            attn_factor)
        cache = self._compute_cos_sin_cache()
        self.cos_sin_cache: paddle.Tensor
        self.register_buffer("cos_sin_cache", cache, persistable=True)

    def _compute_inv_freq(self, scaling_factor: float) -> paddle.Tensor:
        pos_freqs = self.base**(
            paddle.arange(0, self.rotary_dim, 2, dtype=paddle.float32) /
            self.rotary_dim)
        inv_freq_extrapolation = 1.0 / pos_freqs
        inv_freq_interpolation = 1.0 / (scaling_factor * pos_freqs)

        low, high = yarn_find_correction_range(self.beta_fast, self.beta_slow,
                                               self.rotary_dim, self.base,
                                               self.max_position_embeddings)
        # Get n-d rotational scaling corrected for extrapolation
        inv_freq_mask = (1 - yarn_linear_ramp_mask(
            low, high, self.rotary_dim // 2)) * self.extrapolation_factor
        inv_freq = inv_freq_interpolation * (
            1 - inv_freq_mask) + inv_freq_extrapolation * inv_freq_mask
        return inv_freq
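
Restated as a formula (this only transcribes the code in _compute_inv_freq; s is scaling_factor, b is base, d is rotary_dim, ramp is yarn_linear_ramp_mask and alpha is extrapolation_factor):

\theta_i = b^{-2i/d},\qquad
\gamma_i = \bigl(1 - \mathrm{ramp}(i;\,\mathrm{low},\,\mathrm{high})\bigr)\cdot \alpha,\qquad
\hat{\theta}_i = (1 - \gamma_i)\,\frac{\theta_i}{s} + \gamma_i\,\theta_i

so dimensions below `low` keep their original frequencies, while dimensions above `high` are fully interpolated (slowed down by 1/s).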
    def _compute_cos_sin_cache(self) -> paddle.Tensor:
        inv_freq = self._compute_inv_freq(self.scaling_factor)
        t = paddle.arange(self.max_position_embeddings * self.scaling_factor,
                          dtype=paddle.float32)
        freqs = paddle.einsum("i,j->ij", t, inv_freq)
        cos = freqs.cos() * self.mscale
        sin = freqs.sin() * self.mscale
        cache = paddle.concat((cos, sin), axis=-1)
        return cache.cast(self._dtype)
    def forward(
        self,
        position_ids: paddle.Tensor,
        query: paddle.Tensor,
        key: paddle.Tensor,
    ) -> Tuple[paddle.Tensor, paddle.Tensor]:
        """
        Apply the rotary embedding to query and key at the given positions.
        """
        # In-place operations that update the query and key tensors.
        fused_rotary_position_encoding(query, key, position_ids,
                                       self.cos_sin_cache, self.rotary_dim,
                                       False)
        return query, key
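
A minimal usage sketch of the new layer (shapes, dtypes, scaling values and the import path are assumptions, and the fused rotary op behind forward() is only available on CUDA builds of FastDeploy):

import paddle
# Hypothetical import path; the class is defined in the file shown in this diff.
# from fastdeploy.model_executor.layers.rotary_embedding import DeepseekScalingRotaryEmbedding

rope = DeepseekScalingRotaryEmbedding(
    rotary_dim=64,
    max_position_embeddings=4096,   # original training length
    base=10000,
    scaling_factor=40.0,            # 4k -> 160k extension (illustrative)
    mscale=1.0,
    mscale_all_dim=0.0,
)

# [num_tokens, num_heads, head_dim] query/key and their token positions.
q = paddle.randn([8, 16, 64])
k = paddle.randn([8, 16, 64])
positions = paddle.arange(8, dtype="int64")   # per-token positions (dtype assumed)
q, k = rope(positions, q, k)                  # rotated in place and returned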
def get_rope_impl(
    rotary_dim: int,
    base: 10000.0,
-   position_ids,
+   position_ids: paddle.Tensor,
    model_config: Optional[ModelConfig] = None,
    partial_rotary_factor=1,
-):
+) -> paddle.Tensor:
    """
    The real implementation of get_rope
    """
    architecture = model_config.architectures[0]
-   if model_config is not None and model_config is None or architecture.startswith(
-           "Qwen"):
+   if model_config is None or architecture.startswith("Qwen"):
        rotary_emb_layer = QwenRotaryEmbedding(rotary_dim, base,
                                               partial_rotary_factor)
        rotary_emb = rotary_emb_layer(position_ids)
@@ -126,10 +275,10 @@ def get_rope_impl(
def get_rope_xpu(
    rotary_dim: int,
    base: 10000.0,
-   position_ids,
-   model_config: ModelConfig,
+   position_ids: paddle.Tensor,
+   model_config: Optional[ModelConfig] = None,
    partial_rotary_factor=1,
-):
+) -> paddle.Tensor:
    """
    On XPU, the cos and sin computation must be done on the CPU
    """
@@ -143,12 +292,27 @@ def get_rope_xpu(
def get_rope(
    rotary_dim: int,
    base: 10000.0,
-   position_ids,
-   model_config: ModelConfig,
-   partial_rotary_factor=1,
-):
+   position_ids: paddle.Tensor,
+   model_config: Optional[ModelConfig] = None,
+   partial_rotary_factor: int = 1,
+) -> paddle.Tensor:
    """
-   The warpper of get_rope
+   Pre-calculate rotary position embedding for position_ids.

    Args:
        rotary_dim (int):
            Dimension of rotary embeddings (head dimension)
        base (float, optional):
            Base value used to compute the inverse frequencies.
            Default: 10000.0.
        position_ids (paddle.Tensor):
            Tensor containing position indices of input tokens.
        model_config (Optional[ModelConfig]):
            Model configuration object containing architecture information.
            If provided, determines RoPE implementation based on model architecture.
        partial_rotary_factor (int, optional):
            Factor controlling partial rotary application.
            Default: 1 (apply to all dimensions).
    """
    if current_platform.is_xpu():
        return get_rope_xpu(rotary_dim, base, position_ids, model_config,
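
For reference, a standalone sketch of the kind of cos/sin table the default (Qwen-style) path precomputes; QwenRotaryEmbedding's exact cache layout is not shown in this hunk, so treat the shapes as illustrative:

import paddle

rotary_dim, base, seq_len = 128, 10000.0, 32
# Classic RoPE inverse frequencies: one per pair of rotary dimensions.
inv_freq = 1.0 / base ** (
    paddle.arange(0, rotary_dim, 2, dtype="float32") / rotary_dim)
positions = paddle.arange(seq_len, dtype="float32")
freqs = paddle.einsum("i,j->ij", positions, inv_freq)   # [seq_len, rotary_dim // 2]
cos, sin = freqs.cos(), freqs.sin()
print(cos.shape, sin.shape)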
@@ -255,7 +419,24 @@ def get_rope_3d(
    paritial_rotary_factor: 1,
    max_position: 131072,
    freq_allocation: 2,
-):
+) -> paddle.Tensor:
    """
    Pre-calculate rotary position embedding for position_ids.

    Args:
        rotary_dim (int):
            Dimension of rotary embeddings (head dimension)
        base (float, optional):
            Base value used to compute the inverse frequencies.
            Default: 10000.0.
        position_ids (paddle.Tensor):
            Tensor containing position indices of input tokens.
        partial_rotary_factor (int, optional):
            Factor controlling partial rotary application.
            Default: 1 (apply to all dimensions).
        max_position: Maximum position index to precompute.
        freq_allocation: Number of rotary dimensions allocated to temporal axis
    """
    rotary_emb3d_layer = ErnieVlRotaryEmbedding3D(rotary_dim, base,
                                                  paritial_rotary_factor,
                                                  max_position,
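
ErnieVlRotaryEmbedding3D itself is not shown in this diff, so purely as an illustration, this is one way 3D (temporal / height / width) position indices might be laid out before being handed to get_rope_3d; the exact layout the layer expects is an assumption:

import paddle

# Illustrative only: a 2x2 patch grid from one video frame followed by two
# text tokens. Image patches share a temporal index and vary in (h, w);
# text tokens advance the temporal index and keep (h, w) at zero.
t = paddle.to_tensor([0, 0, 0, 0, 1, 2])   # temporal index
h = paddle.to_tensor([0, 0, 1, 1, 0, 0])   # patch row
w = paddle.to_tensor([0, 1, 0, 1, 0, 0])   # patch column
position_ids_3d = paddle.stack([t, h, w], axis=-1)   # [seq_len, 3]
print(position_ids_3d.shape)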