# FastDeploy/custom_ops/setup_ops.py

# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" setup for FastDeploy custom ops """
import glob
import json
import os
import shutil
import subprocess
import tarfile

import paddle
from paddle.utils.cpp_extension import CppExtension, CUDAExtension, setup
from setuptools import find_namespace_packages, find_packages

# Extra compute capabilities to build for, given as a JSON list in the
# BUILDING_ARCS environment variable (defaults to an empty list).
archs = json.loads(os.environ.get("BUILDING_ARCS", "[]"))
# True only when CPU_USE_BF16 is set to the exact string "True".
use_bf16 = os.environ.get("CPU_USE_BF16", "False") == "True"


def _checked_members(tar, destination_directory):
    """
    Yield the members of an open tar archive, refusing any member whose name
    would resolve to a location outside ``destination_directory``.

    Only member names are checked; link targets inside the archive are not.
    """
    root = os.path.realpath(destination_directory)
    for member in tar:
        target = os.path.realpath(os.path.join(root, member.name))
        if os.path.commonpath([root, target]) != root:
            raise ValueError(f"Unsafe path in archive: {member.name}")
        yield member


def download_and_extract(url, destination_directory):
    """
    Download a .tar.gz file using wget to the destination directory
    and extract its contents without renaming the downloaded file.

    The operation is best-effort: failures are reported on stdout and the
    function returns normally, so callers must check that the expected files
    exist afterwards. The downloaded archive is removed in every case,
    including when the download or the extraction fails.

    :param url: The URL of the .tar.gz file to download.
    :param destination_directory: The directory where the file should be downloaded and extracted.
    """
    os.makedirs(destination_directory, exist_ok=True)

    filename = os.path.basename(url)
    file_path = os.path.join(destination_directory, filename)

    try:
        try:
            subprocess.run(
                ["wget", "-O", file_path, url],
                check=True,
            )
        except (subprocess.CalledProcessError, OSError) as e:
            # OSError covers the case where wget itself cannot be started.
            print(f"Error downloading file: {e}")
            return
        print(f"Downloaded: {file_path}")

        try:
            with tarfile.open(file_path, "r:gz") as tar:
                tar.extractall(
                    path=destination_directory,
                    members=_checked_members(tar, destination_directory),
                )
            print(f"Extracted: {file_path} to {destination_directory}")
        except Exception as e:
            print(f"Error extracting file: {e}")
    finally:
        # A failed download or extraction may leave a partial archive behind;
        # remove it so it is never mistaken for a usable file later.
        try:
            os.remove(file_path)
            print(f"Deleted downloaded file: {file_path}")
        except FileNotFoundError:
            pass
        except OSError as e:
            print(f"Error deleting downloaded file: {e}")


def clone_git_repo(version, repo_url, destination_path):
    """
    Clone a single branch or tag of a git repo to destination path.

    :param version: Branch or tag name passed to ``git clone -b``.
    :param repo_url: Location of the repository to clone.
    :param destination_path: Directory to clone into.
    :return: True if ``git clone`` succeeded; False if it exited with a
        non-zero status or if git could not be started at all.
    """
    command = [
        "git",
        "clone",
        "-b",
        version,
        "--single-branch",
        repo_url,
        destination_path,
    ]
    try:
        subprocess.run(command, check=True)
    except (subprocess.CalledProcessError, OSError):
        # OSError covers a missing or non-executable git binary, which would
        # otherwise escape as an exception instead of the documented False.
        return False
    return True


def get_sm_version(archs):
    """
    Collect the compute capabilities to build for.

    The result contains every entry of ``archs`` plus the capability of the
    device reported by ``paddle.device.cuda.get_device_properties()``,
    encoded as ``major * 10 + minor``. If that query raises ``ValueError``
    only the entries of ``archs`` are returned. Duplicates are removed and
    the order of the returned list is unspecified.
    """
    sm_versions = set(archs)
    try:
        device = paddle.device.cuda.get_device_properties()
        sm_versions.add(device.major * 10 + device.minor)
    except ValueError:
        # Best effort: fall back to the explicitly requested archs.
        pass
    return [*sm_versions]


def get_gencode_flags(archs):
    """
    Build the nvcc ``-gencode`` flags for the requested and detected archs.

    One ``-gencode arch=compute_X,code=sm_X`` pair is emitted per compute
    capability returned by ``get_sm_version(archs)``; capability 90 is
    emitted as ``90a``.
    """
    flags = []
    for sm in get_sm_version(archs):
        target = f"{sm}a" if sm == 90 else sm
        flags.extend(("-gencode", f"arch=compute_{target},code=sm_{target}"))
    return flags


def find_end_files(directory, end_str):
    """
    Recursively list the files under ``directory`` whose name ends with
    ``end_str``.

    Paths are returned joined onto the directory they were found in, in the
    order produced by ``os.walk``. A missing directory yields an empty list.
    """
    return [
        os.path.join(parent, name)
        for parent, _subdirs, names in os.walk(directory)
        for name in names
        if name.endswith(end_str)
    ]


# Build dispatch: exactly one setup() call is made, chosen by how the installed
# paddle was compiled (ROCm, CUDA, XPU, otherwise CPU).
if paddle.is_compiled_with_rocm():
    # NOTE(@duanyanhui): paddle.is_compiled_with_cuda() returns True when paddle compiled with rocm.
    # so we need to check if paddle compiled with rocm at first.
    # ROCm build: a fixed list of op sources, compiled with the hipcc flags below.
    setup(
        name="fastdeploy_ops",
        ext_modules=CUDAExtension(
            sources=[
                "gpu_ops/save_with_output.cc",
                "gpu_ops/set_mask_value.cu",
                "gpu_ops/set_value_by_flags.cu",
                "gpu_ops/ngram_mask.cu",
                "gpu_ops/gather_idx.cu",
                "gpu_ops/token_penalty_multi_scores.cu",
                "gpu_ops/token_penalty_only_once.cu",
                "gpu_ops/stop_generation.cu",
                "gpu_ops/stop_generation_multi_ends.cu",
                "gpu_ops/stop_generation_multi_stop_seqs.cu",
                "gpu_ops/set_flags.cu",
                "gpu_ops/fused_get_rope.cu",
                "gpu_ops/transfer_output.cc",
                "gpu_ops/get_padding_offset.cu",
                "gpu_ops/update_inputs.cu",
                "gpu_ops/update_inputs_beam.cu",
                "gpu_ops/beam_search_softmax.cu",
                "gpu_ops/rebuild_padding.cu",
                "gpu_ops/save_with_output_msg.cc",
                "gpu_ops/get_output.cc",
                "gpu_ops/get_output_msg_with_topk.cc",
                "gpu_ops/reset_need_stop_value.cc",
                "gpu_ops/step.cu",
                "gpu_ops/step_reschedule.cu",
                "gpu_ops/set_data_ipc.cu",
                "gpu_ops/read_data_ipc.cu",
                "gpu_ops/dequant_int8.cu",
                "gpu_ops/enforce_generation.cu",
                "gpu_ops/tune_cublaslt_gemm.cu",
            ],
            extra_compile_args={
                "cxx": ["-O3"],
                "hipcc": [
                    "-O3",
                    "--gpu-max-threads-per-block=1024",
                    # -U undefines the __HIP_NO_* macros; presumably this keeps
                    # the half/bfloat16 operators and conversions available in
                    # the HIP headers - confirm.
                    "-U__HIP_NO_HALF_OPERATORS__",
                    "-U__HIP_NO_HALF_CONVERSIONS__",
                    "-U__HIP_NO_BFLOAT16_OPERATORS__",
                    "-U__HIP_NO_BFLOAT16_CONVERSIONS__",
                    "-U__HIP_NO_BFLOAT162_OPERATORS__",
                    "-U__HIP_NO_BFLOAT162_CONVERSIONS__",
                ],
            },
        ),
    )
elif paddle.is_compiled_with_cuda():
    # Sources compiled for every CUDA build, whatever the compute capability.
    sources = [
        "gpu_ops/set_mask_value.cu", "gpu_ops/set_value_by_flags.cu",
        "gpu_ops/ngram_mask.cu", "gpu_ops/gather_idx.cu",
        "gpu_ops/get_output_ep.cc", "gpu_ops/get_mm_split_fuse.cc",
        "gpu_ops/token_penalty_multi_scores.cu",
        "gpu_ops/token_penalty_only_once.cu", "gpu_ops/stop_generation.cu",
        "gpu_ops/stop_generation_multi_ends.cu",
        "gpu_ops/stop_generation_multi_stop_seqs.cu", "gpu_ops/set_flags.cu",
        "gpu_ops/step.cu", "gpu_ops/step_reschedule.cu",
        "gpu_ops/fused_get_rope.cu", "gpu_ops/get_padding_offset.cu",
        "gpu_ops/update_inputs.cu", "gpu_ops/update_inputs_beam.cu",
        "gpu_ops/beam_search_softmax.cu", "gpu_ops/rebuild_padding.cu",
        "gpu_ops/set_data_ipc.cu", "gpu_ops/read_data_ipc.cu",
        "gpu_ops/enforce_generation.cu", "gpu_ops/dequant_int8.cu",
        "gpu_ops/tune_cublaslt_gemm.cu", "gpu_ops/swap_cache_batch.cu",
        "gpu_ops/swap_cache.cu", "gpu_ops/step_system_cache.cu",
        "gpu_ops/cpp_extensions.cu", "gpu_ops/share_external_data.cu",
        "gpu_ops/per_token_quant_fp8.cu",
        "gpu_ops/extract_text_token_output.cu",
        "gpu_ops/update_split_fuse_input.cu"
    ]

    # pd_disaggregation
    sources += [
        "gpu_ops/remote_cache_kv_ipc.cc",
        "gpu_ops/open_shm_and_get_meta_signal.cc",
        "gpu_ops/init_signal_layerwise.cc",
    ]

    # Fetch CUTLASS v3.8.0 when third_party/cutlass is missing or empty.
    # The return value of clone_git_repo is not used; success is judged by
    # whether the directory has any content afterwards.
    cutlass_dir = "third_party/cutlass"
    if not os.path.exists(cutlass_dir) or not os.listdir(cutlass_dir):
        if not os.path.exists(cutlass_dir):
            os.makedirs(cutlass_dir)
        clone_git_repo("v3.8.0", "https://github.com/NVIDIA/cutlass.git",
                       cutlass_dir)
        if not os.listdir(cutlass_dir):
            raise ValueError("Git clone cutlass failed!")

    # deep gemm
    # The cute/cutlass headers are copied under the deep_gemm package's
    # include directory, which the package_data of the setup() call below lists.
    dg_third_party_include_dirs = (
        "third_party/cutlass/include/cute",
        "third_party/cutlass/include/cutlass",
    )

    dg_include_dir = "gpu_ops/fp8_deep_gemm/deep_gemm/include"
    os.makedirs(dg_include_dir, exist_ok=True)

    for d in dg_third_party_include_dirs:
        dirname = d.split("/")[-1]
        src_dir = d
        dst_dir = os.path.join(dg_include_dir, dirname)

        # Remove existing directory if it exists
        # NOTE(review): os.path.exists() is False for a dangling symlink, so
        # such a link is not removed here and copytree below would then fail.
        if os.path.exists(dst_dir):
            if os.path.islink(dst_dir):
                os.unlink(dst_dir)
            else:
                shutil.rmtree(dst_dir)
        print(f"Copying {src_dir} to {dst_dir}")

        # Copy the directory
        try:
            shutil.copytree(src_dir, dst_dir)
        except Exception as e:
            raise RuntimeError(
                f"Failed to copy from {src_dir} to {dst_dir}: {e}")

    # Fetch nlohmann/json v3.11.3 with the same missing-or-empty check as CUTLASS.
    json_dir = "third_party/nlohmann_json"
    if not os.path.exists(json_dir) or not os.listdir(json_dir):
        if not os.path.exists(json_dir):
            os.makedirs(json_dir)
        clone_git_repo("v3.11.3", "https://github.com/nlohmann/json.git",
                       json_dir)
        if not os.listdir(json_dir):
            raise ValueError("Git clone nlohmann_json failed!")

    # nvcc arguments: one -gencode pair per target arch, then defines and
    # include paths shared by all sources.
    nvcc_compile_args = get_gencode_flags(archs)
    nvcc_compile_args += ["-DPADDLE_DEV"]
    nvcc_compile_args += [
        "-Igpu_ops/cutlass_kernels",
        "-Ithird_party/cutlass/include",
        "-Igpu_ops/fp8_gemm_with_cutlass",
        "-Igpu_ops",
        "-Ithird_party/nlohmann_json/include",
    ]
    # The feature gates below are keyed on the highest compute capability among
    # the requested and detected ones.
    # NOTE(review): max() raises ValueError when get_sm_version returns an
    # empty list (BUILDING_ARCS empty and no device capability added).
    cc = max(get_sm_version(archs))
    print(f"cc = {cc}")
    if cc >= 80:
        # append_attention
        sources += ["gpu_ops/append_attention.cu"]
        sources += find_end_files("gpu_ops/append_attn", ".cu")
        # gemm_dequant
        sources += ["gpu_ops/int8_gemm_with_cutlass/gemm_dequant.cu"]
        # speculate_decoding
        sources += find_end_files("gpu_ops/speculate_decoding", ".cu")
        sources += find_end_files("gpu_ops/speculate_decoding", ".cc")
        nvcc_compile_args += ["-DENABLE_BF16"]
        # moe
        sources += find_end_files("gpu_ops/cutlass_kernels/moe_gemm/", ".cu")
        sources += find_end_files("gpu_ops/cutlass_kernels/w4a8_moe/", ".cu")
        sources += find_end_files("gpu_ops/moe/", ".cu")
        nvcc_compile_args += ["-Igpu_ops/moe"]

    if cc >= 89:
        # Run the FP8 GEMM code generators.
        # NOTE(review): the exit status of os.system is ignored, so a failing
        # generator is not detected here; "python" is resolved through the
        # shell rather than being the interpreter running this script.
        nvcc_compile_args += ["-DENABLE_FP8"]
        os.system("python auto_gen_fp8_fp8_gemm_fused_kernels.py")
        os.system("python auto_gen_fp8_fp8_dual_gemm_fused_kernels.py")
        os.system("python auto_gen_visitor_fp8_gemm_fused_kernels.py")

        nvcc_compile_args += [
            "-Igpu_ops/cutlass_kernels/fp8_gemm_fused/autogen"
        ]

        sources += [
            "gpu_ops/fp8_gemm_with_cutlass/fp8_fp8_half_gemm.cu",
            "gpu_ops/cutlass_kernels/fp8_gemm_fused/fp8_fp8_gemm_scale_bias_act.cu",
            "gpu_ops/fp8_gemm_with_cutlass/fp8_fp8_fp8_dual_gemm.cu",
            "gpu_ops/cutlass_kernels/fp8_gemm_fused/fp8_fp8_dual_gemm_scale_bias_act.cu",
            "gpu_ops/fp8_gemm_with_cutlass/fp8_fp8_half_cuda_core_gemm.cu",
            "gpu_ops/fp8_gemm_with_cutlass/per_channel_fp8_fp8_half_gemm.cu",
            "gpu_ops/cutlass_kernels/fp8_gemm_fused/visitor_fp8_gemm_fused.cu",
            "gpu_ops/scaled_gemm_f8_i4_f16_gemm.cu",
            "gpu_ops/scaled_gemm_f8_i4_f16_weight_quantize.cu",
            "gpu_ops/cutlass_kernels/cutlass_heuristic.cu",
            "gpu_ops/cutlass_kernels/cutlass_preprocessors.cu",
            "gpu_ops/air_topp_sampling.cu",
        ]
    if cc >= 90:
        # Adds a compute_90a target with code=compute_90a, in addition to the
        # sm_90a code that get_gencode_flags emits for capability 90.
        nvcc_compile_args += [
            "-gencode",
            "arch=compute_90a,code=compute_90a",
            "-O3",
            "-DNDEBUG",
        ]
        # Exit status ignored here as well (see the note on the generators above).
        os.system("python auto_gen_fp8_fp8_block_gemm_fused_kernels_sm90.py")
        sources += ["gpu_ops/fp8_gemm_with_cutlass/fp8_fp8_half_block_gemm.cu"]

    # for fp8 autogen *.cu
    # Collected only after every generator above has run; presumably so that
    # files written by the sm90 generator are picked up too - confirm.
    if cc >= 89:
        sources += find_end_files(
            "gpu_ops/cutlass_kernels/fp8_gemm_fused/autogen", ".cu")

    # Besides the compiled extension, ship the packages found under
    # gpu_ops/fp8_deep_gemm together with the headers copied earlier.
    setup(
        name="fastdeploy_ops",
        ext_modules=CUDAExtension(
            sources=sources,
            extra_compile_args={"nvcc": nvcc_compile_args},
            libraries=["cublasLt"],
        ),
        packages=find_packages(where="gpu_ops/fp8_deep_gemm"),
        package_dir={"": "gpu_ops/fp8_deep_gemm"},
        package_data={
            "deep_gemm": [
                "include/deep_gemm/**/*",
                "include/cute/**/*",
                "include/cutlass/**/*",
            ]
        },
        include_package_data=True,
    )
elif paddle.is_compiled_with_xpu():
    # TODO zhangsishuai@baidu.com to add xpu ops
    # NOTE(review): this branch builds .cu sources through CUDAExtension;
    # confirm that this is what an XPU build is meant to do.
    setup(
        name="fastdeploy_ops",
        ext_modules=CUDAExtension(sources=[
            "xpu_ops/set_mask_value.cu",
            "xpu_ops/set_value_by_flags.cu",
            "xpu_ops/ngram_mask.cu",
            "xpu_ops/gather_idx.cu",
            "xpu_ops/token_penalty_multi_scores.cu",
            "xpu_ops/token_penalty_only_once.cu",
        ]),
    )
else:
    # CPU build: links against prebuilt xFasterTransformer and x86-simd-sort
    # archives that are downloaded into third_party/ when missing or empty.
    # CPU_USE_BF16 is re-read here although use_bf16 is already set at module
    # level from the same variable.
    use_bf16 = os.getenv("CPU_USE_BF16", "False") == "True"
    x86_simd_sort_dir = "third_party/x86-simd-sort"
    if not os.path.exists(x86_simd_sort_dir) or not os.listdir(
            x86_simd_sort_dir):
        x86_simd_sort_url = "https://paddlepaddle-inference-banchmark.bj.bcebos.com/x86-simd-sort.tar.gz"
        download_and_extract(x86_simd_sort_url, "third_party")
    xft_dir = "third_party/xFasterTransformer"
    if not os.path.exists(xft_dir) or not os.listdir(xft_dir):
        # Two archives exist; the bf16 one is selected by CPU_USE_BF16.
        if use_bf16:
            xft_url = (
                "https://paddlepaddle-inference-banchmark.bj.bcebos.com/xft.tar.gz"
            )
        else:
            xft_url = "https://paddlepaddle-inference-banchmark.bj.bcebos.com/xft_no_bf16.tar.gz"
        download_and_extract(xft_url, "third_party")

    # Libraries the extension links against.
    libs = [
        "xfastertransformer",
        "xft_comm_helper",
        "x86simdsortcpp",
    ]
    # Both directories are reassigned to the same values they already hold.
    xft_dir = "third_party/xFasterTransformer"
    x86_simd_sort_dir = "third_party/x86-simd-sort"
    paddle_custom_kernel_include = [
        os.path.join(xft_dir, "include"),
        os.path.join(xft_dir, "src/common"),  # src
        os.path.join(xft_dir, "src/kernels"),  # src
        os.path.join(xft_dir, "src/layers"),  # src
        os.path.join(xft_dir, "src/models"),  # src
        os.path.join(xft_dir, "src/utils"),  # src
        os.path.join(xft_dir, "3rdparty/onednn/include"),  # src
        os.path.join(xft_dir, "3rdparty/onednn/build/include"),  # src
        os.path.join(xft_dir, "3rdparty/xdnn"),  # src
        os.path.join(xft_dir, "3rdparty"),
        os.path.join(xft_dir, "3rdparty/mkl/include"),
        os.path.join(x86_simd_sort_dir, "src"),  # src
    ]

    # cc flags
    paddle_extra_compile_args = [
        "-std=c++17",
        "-shared",
        "-fPIC",
        "-Wno-parentheses",
        "-DPADDLE_WITH_CUSTOM_KERNEL",
        "-mavx512f",
        "-mavx512vl",
        "-fopenmp",
        "-mavx512bw",
        "-mno-mmx",
        "-Wall",
        "-march=skylake-avx512",
        "-O3",
        "-g",
        "-lstdc++fs",
        "-D_GLIBCXX_USE_CXX11_ABI=1",
    ]
    if use_bf16:
        # avx512-bf16 flags
        paddle_extra_compile_args += [
            "-DAVX512_BF16_WEIGHT_ONLY_BF16=true",
            "-DAVX512_FP16_WEIGHT_ONLY_INT8=true",
            "-DAVX512_FP16_WEIGHT_ONLY_FP16=true",
        ]
    else:
        # no avx512-bf16 flags
        paddle_extra_compile_args += [
            "-DAVX512_FP32_WEIGHT_ONLY_INT8=true",
            "-DAVX512_FP32_WEIGHT_ONLY_FP16=true",
        ]
    paddle_custom_kernel_library_dir = [
        "third_party/xFasterTransformer/build/",
        "third_party/x86-simd-sort/builddir",
    ]

    # Headers to ship: only the *.h files directly inside each include
    # directory (the glob is not recursive).
    include_files = []
    for include_dir in paddle_custom_kernel_include:
        include_files.extend(glob.glob(os.path.join(include_dir, "*.h")))
    # Shared libraries to ship: every lib<name>.so that actually exists in one
    # of the library directories.
    so_files = []
    for library_dir in paddle_custom_kernel_library_dir:
        if os.path.isdir(library_dir):
            for lib in libs:
                lib_file = os.path.join(library_dir, f"lib{lib}.so")
                if os.path.isfile(lib_file):
                    so_files.append(lib_file)
    setup(
        name="fastdeploy_cpu_ops",
        ext_modules=CppExtension(
            sources=[
                "cpu_ops/simd_sort.cc",
                "cpu_ops/set_value_by_flags.cc",
                "cpu_ops/token_penalty_multi_scores.cc",
                "cpu_ops/stop_generation_multi_ends.cc",
                "cpu_ops/update_inputs.cc",
                "cpu_ops/get_padding_offset.cc",
                "cpu_ops/xft_all_layer.cc",
                "cpu_ops/xft_greedy_search.cc",
                "cpu_ops/avx_weight_only.cc",
            ],
            # $ORIGIN-relative rpaths: the shared libraries are looked up
            # relative to the location of the built extension at load time.
            extra_link_args=[
                "-Wl,-rpath,$ORIGIN/x86-simd-sort/builddir",
                "-Wl,-rpath,$ORIGIN/xFasterTransformer/build",
            ],
            include_dirs=paddle_custom_kernel_include,
            library_dirs=paddle_custom_kernel_library_dir,
            libraries=libs,
            extra_compile_args=paddle_extra_compile_args,
        ),
        packages=find_namespace_packages(where="third_party"),
        package_dir={"": "third_party"},
        package_data={"fastdeploy_cpu_ops": include_files + so_files},
        include_package_data=True,
    )