# FastDeploy/fastdeploy/model_executor/layers/utils.py

"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""

from typing import Tuple

import numpy as np
import paddle
from paddle import Tensor
from paddle.framework import in_dynamic_mode
from fastdeploy.platforms import current_platform
if current_platform.is_cuda() and current_platform.available():
    # The padding-offset custom operators are only imported when the current
    # platform reports CUDA and is available; on other platforms these names
    # are simply not defined in this module.
    try:
        from fastdeploy.model_executor.ops.gpu import (
            get_padding_offset,
            speculate_get_padding_offset,
        )
    except Exception as exc:
        # Chain the underlying failure explicitly so the real reason the
        # custom-op import broke is reported as the direct cause.
        raise ImportError(
            "Verify environment consistency between compilation and FastDeploy installation. "
            "And ensure the Paddle version supports FastDeploy's custom operators"
        ) from exc
import re

import os
# Optional extra state dict: the CACHE_PARAMS environment variable holds the
# path handed to paddle.load; the sentinel string "none" (also the default)
# disables the feature.
cache_params = os.getenv("CACHE_PARAMS", "none")
if cache_params != "none":
    # Loaded once at import time with numpy values. NOTE: c8_state_dict is
    # only defined when CACHE_PARAMS is set, so readers must check
    # cache_params first (get_tensor in this file does).
    c8_state_dict = paddle.load(cache_params, return_numpy=True)

def per_block_cast_to_fp8(x: Tensor) -> Tuple[Tensor, Tensor]:
    """
    Quantize a 2-D tensor to float8_e4m3fn with one scale per 128x128 block.

    Only used in deep_gemm block wise quant weight.
    copy from FastDeploy/custom_ops/gpu_ops/fp8_deep_gemm/tests/test_core.py.

    Args:
        x (Tensor): 2-D tensor to quantize (asserted via ``x.dim() == 2``).

    Returns:
        Tuple[Tensor, Tensor]: the float8_e4m3fn tensor cropped back to the
        shape of ``x``, and the per-block scales (``amax / 448.0``) viewed as
        ``(row_blocks, col_blocks)``.
    """
    from fastdeploy.model_executor.ops.gpu.deep_gemm import ceil_div

    assert x.dim() == 2
    rows, cols = x.shape

    # Zero-pad both dimensions up to the next multiple of 128 so the tensor
    # can be viewed as a grid of 128x128 blocks.
    padded_rows = ceil_div(rows, 128) * 128
    padded_cols = ceil_div(cols, 128) * 128
    padded = paddle.zeros((padded_rows, padded_cols), dtype=x.dtype)
    padded[:rows, :cols] = x
    blocks = paddle.view(padded, (-1, 128, padded.shape[1] // 128, 128))

    # Per-block absolute maximum (reduced over the two in-block axes),
    # floored at 1e-4 so the division below never sees zero.
    block_amax = paddle.amax(
        paddle.abs(blocks).astype(paddle.float32), axis=(1, 3), keepdim=True
    )
    block_amax = paddle.clip(block_amax, min=1e-4)

    # 448.0 is presumably the largest finite float8_e4m3fn magnitude; each
    # block is rescaled so its amax maps onto it before the cast.
    quantized = (blocks * (448.0 / block_amax)).astype(paddle.float8_e4m3fn)

    fp8_weight = quantized.view_as(padded)[:rows, :cols].contiguous()
    scales = paddle.view(block_amax / 448.0, (blocks.shape[0], blocks.shape[2]))
    return fp8_weight, scales


# for distributed tensor model parallel
def _set_var_distributed(var, split_axis):
    """
    Flag a variable as distributed and record its sharding axis.

    Args:
    var (Variable, Optional): The variable to flag. ``None`` is accepted and
    makes the call a no-op. Otherwise ``is_distributed`` is set to True and
    ``split_axis`` is stored on it.
    split_axis (Integer): the sharding dimension of dist tensors

    Returns:
    None. No return value.

    """
    if var is None:
        return

    var.is_distributed = True
    var.split_axis = split_axis

    if in_dynamic_mode():
        return

    # Static graph: the same-named variables in the startup and main programs
    # must be flagged as well.
    # NOTE: use current_block and find_var_recursive to support while_loop
    static_blocks = (
        paddle.static.default_startup_program().current_block(),
        paddle.static.default_main_program().current_block(),
    )
    for block in static_blocks:
        block._find_var_recursive(var.name).is_distributed = True


def get_tensor(input):
    """
    Resolve a weight reference into a ``paddle.Tensor``.

    In EP parallelism the weights are stored distributed by layer. To reduce
    peak GPU memory, state_dict processing keeps only the layer name and the
    path of the corresponding weight, so the weight has to be converted to a
    ``paddle.Tensor`` here.

    Args:
        input: One of
            * ``paddle.Tensor`` - returned as is, except that a tensor on a
              CPU place is moved to ``paddle.device.get_device()``;
            * ``np.ndarray`` - converted with ``paddle.to_tensor``;
            * ``str`` containing ``".safetensors"`` - must have the form
              ``"[key_name]model_path"``; the tensor ``key_name`` is read
              from that safetensors file;
            * any other ``str`` - treated as a path for ``paddle.load``. When
              CACHE_PARAMS is enabled and the last ``/``-separated component
              is a key of ``c8_state_dict``, that entry is popped and
              returned instead;
            * anything else - returned unchanged.

    Returns:
        The resolved tensor; ``None`` when the requested key is absent from
        the safetensors file; or ``input`` itself for unsupported types.

    Raises:
        ValueError: If a ``.safetensors`` reference does not match the
            ``"[key_name]model_path"`` form. (Previously this surfaced as an
            ``UnboundLocalError`` on ``model_path``.)
    """
    if isinstance(input, paddle.Tensor):
        if input.place.is_cpu_place():
            return input.to(paddle.device.get_device())
        return input

    if isinstance(input, np.ndarray):
        return paddle.to_tensor(input)

    if not isinstance(input, str):
        # In theory this branch is never hit.
        return input

    if ".safetensors" in input:
        match = re.match(r"\[(.*?)\](.*)", input)
        if match is None:
            # Without a match neither the key nor the path is known, so fail
            # with a clear message rather than an unbound-variable error.
            raise ValueError(
                f"Invalid safetensors weight reference {input!r}; "
                "expected the form '[key_name]path/to/file.safetensors'"
            )
        key_name, model_path = match.group(1), match.group(2)

        from safetensors import safe_open

        with safe_open(model_path, framework="np", device="cpu") as f:
            if key_name not in f.keys():
                return None
            # Wrap the numpy array without copying, then copy it to the
            # place Paddle currently expects (second argument: blocking=False).
            weight = paddle.Tensor(f.get_tensor(key_name), zero_copy=True)
            return weight._copy_to(
                paddle.framework._current_expected_place(), False
            )

    if cache_params != "none":
        tmp_key = input.split("/")[-1]
        if tmp_key in c8_state_dict:
            print(f"Loading {tmp_key} in extra C8_state_dict")
            return paddle.to_tensor(c8_state_dict.pop(tmp_key))
    return paddle.load(input)


def ensure_divisibility(numerator, denominator):
    """Ensure that numerator is divisible by the denominator.

    Args:
        numerator: Value to be divided.
        denominator: Value that must divide ``numerator`` without remainder.

    Raises:
        AssertionError: If ``numerator % denominator`` is not equal to 0. The
            error is raised explicitly rather than through an ``assert``
            statement so the check still runs under ``python -O``.
    """
    if not (numerator % denominator == 0):
        raise AssertionError("{} is not divisible by {}".format(
            numerator, denominator))


def divide(numerator, denominator):
    """Return ``numerator // denominator`` after a divisibility check.

    ``ensure_divisibility`` is called first, so its failure propagates
    before any division result is produced.
    """
    ensure_divisibility(numerator, denominator)
    quotient = numerator // denominator
    return quotient

def remove_padding(max_len, input_ids, seq_lens_this_time):
    """
    Strip padding from ``input_ids`` via the ``get_padding_offset`` custom op.

    Only the CUDA platform is handled; on any other platform nothing is
    computed and ``None`` is returned.

    Args:
        max_len: Padded length; ``max_len - seq_lens_this_time`` is
            cumulatively summed to build the offsets passed to the op.
        input_ids: Forwarded unchanged to the op (presumably the padded
            token ids - confirm against the caller).
        seq_lens_this_time: Lengths tensor; its sum is passed to the op as
            the token count.

    Returns:
        On CUDA, the tuple ``(ids_remove_padding, padding_offset,
        cum_offsets, cu_seqlens_q, cu_seqlens_k)``. The op yields
        ``cum_offsets`` before ``padding_offset``; the two are swapped in
        the returned tuple.
    """
    if not current_platform.is_cuda():
        return None

    pad_cumsum = paddle.cumsum(max_len - seq_lens_this_time)
    total_tokens = paddle.sum(seq_lens_this_time)
    op_outputs = get_padding_offset(input_ids, pad_cumsum, total_tokens,
                                    seq_lens_this_time)
    (ids_remove_padding, cum_offsets, padding_offset, cu_seqlens_q,
     cu_seqlens_k) = op_outputs
    return (ids_remove_padding, padding_offset, cum_offsets, cu_seqlens_q,
            cu_seqlens_k)

def speculate_remove_padding(max_len, input_ids, seq_lens_this_time,
                                    draft_tokens, seq_lens_encoder):
    """
    Strip padding via the ``speculate_get_padding_offset`` custom op.

    Only the CUDA platform is handled; on any other platform nothing is
    computed and ``None`` is returned.

    Args:
        max_len: Padded length; ``max_len - seq_lens_this_time`` is
            cumulatively summed to build the offsets passed to the op.
        input_ids: Forwarded unchanged to the op.
        seq_lens_this_time: Lengths tensor; its sum is passed to the op as
            the token count.
        draft_tokens: Forwarded unchanged to the op (presumably the
            speculative draft tokens - confirm against the caller).
        seq_lens_encoder: Forwarded unchanged to the op.

    Returns:
        On CUDA, the tuple ``(ids_remove_padding, padding_offset,
        cum_offsets, cu_seqlens_q, cu_seqlens_k)``. The op yields
        ``cum_offsets`` before ``padding_offset``; the two are swapped in
        the returned tuple.
    """
    if not current_platform.is_cuda():
        return None

    pad_cumsum = paddle.cumsum(max_len - seq_lens_this_time)
    total_tokens = paddle.sum(seq_lens_this_time)
    op_outputs = speculate_get_padding_offset(
        input_ids,
        draft_tokens,
        pad_cumsum,
        total_tokens,
        seq_lens_this_time,
        seq_lens_encoder,
    )
    (ids_remove_padding, cum_offsets, padding_offset, cu_seqlens_q,
     cu_seqlens_k) = op_outputs
    return (ids_remove_padding, padding_offset, cum_offsets, cu_seqlens_q,
            cu_seqlens_k)