FastDeploy/fastdeploy/model_executor/ops/iluvatar/paged_attention.py

"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""

import paddle

try:
    from fastdeploy.model_executor.ops.iluvatar import paged_attn
except ImportError:
    paged_attn = None


def paged_attention(
    q: paddle.Tensor,
    k_cache: paddle.Tensor,
    v_cache: paddle.Tensor,
    block_tables: paddle.Tensor,
    seq_lens: paddle.Tensor,
    num_kv_heads: int,
    scale: float,
    block_size: int,
    max_context_len: int,
    alibi_slopes: paddle.Tensor = None,
    causal: bool = True,
    window_left: int = -1,
    window_right: int = -1,
    softcap: float = 0.0,
    use_cuda_graph: bool = False,
    use_sqrt_alibi: bool = False,
    merged_qkv: bool = False,
    k: paddle.Tensor = None,
    v: paddle.Tensor = None,
    rope_sin: paddle.Tensor = None,
    rope_cos: paddle.Tensor = None,
):
    """Invoke the Iluvatar ``paged_attn`` custom op and return its output.

    This is a thin wrapper: it performs no computation of its own and only
    forwards its arguments positionally to ``paged_attn``. Note that the op's
    positional order differs from this function's signature order -- all
    tensor arguments (including the optional ones) are passed first, followed
    by the scalar/flag arguments.

    The exact semantics, shapes and dtypes of the arguments are defined by the
    underlying op and are not visible from this wrapper.
    NOTE(review): by name, ``k_cache``/``v_cache``/``block_tables`` look like a
    block-paged KV cache and its per-sequence block index table -- confirm
    against the op's definition.

    Args:
        q: Forwarded as the op's 1st argument.
        k_cache: Forwarded as the op's 2nd argument.
        v_cache: Forwarded as the op's 3rd argument.
        block_tables: Forwarded as the op's 4th argument.
        seq_lens: Forwarded as the op's 5th argument.
        num_kv_heads: Forwarded unchanged to the op.
        scale: Forwarded unchanged to the op.
        block_size: Forwarded unchanged to the op.
        max_context_len: Forwarded unchanged to the op.
        alibi_slopes: Optional tensor; ``None`` is forwarded as-is.
        causal: Forwarded unchanged to the op.
        window_left: Forwarded unchanged to the op.
        window_right: Forwarded unchanged to the op.
        softcap: Forwarded unchanged to the op.
        use_cuda_graph: Forwarded unchanged to the op.
        use_sqrt_alibi: Forwarded unchanged to the op.
        merged_qkv: Forwarded unchanged to the op.
        k: Optional tensor; ``None`` is forwarded as-is.
        v: Optional tensor; ``None`` is forwarded as-is.
        rope_sin: Optional tensor; ``None`` is forwarded as-is.
        rope_cos: Optional tensor; ``None`` is forwarded as-is.

    Returns:
        The first element of the op's result when the op returns a ``list``;
        otherwise the op's result unchanged.

    Raises:
        RuntimeError: If the ``paged_attn`` custom op could not be imported
            when this module was loaded.
    """
    # The module-level import falls back to ``None`` on ImportError; fail with
    # an actionable message instead of "'NoneType' object is not callable".
    if paged_attn is None:
        raise RuntimeError(
            "The 'paged_attn' custom op is unavailable: it could not be imported "
            "from 'fastdeploy.model_executor.ops.iluvatar'. Make sure the "
            "Iluvatar custom ops are built and installed."
        )

    # Arguments are passed positionally in the order used by the op:
    # tensors first, then scalar attributes.
    output = paged_attn(
        q,
        k_cache,
        v_cache,
        block_tables,
        seq_lens,
        alibi_slopes,
        k,
        v,
        rope_sin,
        rope_cos,
        num_kv_heads,
        scale,
        block_size,
        max_context_len,
        causal,
        window_left,
        window_right,
        softcap,
        use_cuda_graph,
        use_sqrt_alibi,
        merged_qkv,
    )

    # The op may hand back its outputs wrapped in a list; unwrap the first one.
    if isinstance(output, list):
        return output[0]
    return output