Sync v2.0 version of code to github repo

This commit is contained in:
Jiang-Jia-Jun
2025-06-29 23:29:37 +00:00
parent d151496038
commit 92c2cfa2e7
597 changed files with 78776 additions and 22905 deletions

View File

@@ -1,4 +1,4 @@
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -12,19 +12,40 @@
# See the License for the specific language governing permissions and
# limitations under the License.
""" setup for FastDeploy custom ops """
import glob
import importlib
import json
import os
import shutil
import subprocess
import sys
import tarfile
from pathlib import Path
import paddle
from paddle.utils.cpp_extension import CppExtension, CUDAExtension, setup
from setuptools import find_namespace_packages, find_packages
archs = json.loads(os.getenv("BUILDING_ARCS", "[]"))
use_bf16 = os.getenv("CPU_USE_BF16", "False") == "True"
def load_module_from_path(module_name, path):
    """
    Load a Python module directly from a file path.

    Args:
        module_name: Name to register the module under in ``sys.modules``.
        path: Filesystem path of the ``.py`` file to execute.

    Returns:
        The loaded module object.

    Raises:
        ImportError: If no import spec can be created for ``path``
            (e.g. the file does not exist or is not importable).
    """
    spec = importlib.util.spec_from_file_location(module_name, path)
    # spec is None for a missing/unloadable file; fail with a clear error
    # instead of an opaque AttributeError on spec.loader below.
    if spec is None or spec.loader is None:
        raise ImportError(
            f"Cannot load module '{module_name}' from path '{path}'")
    module = importlib.util.module_from_spec(spec)
    # Register before exec so the module can self-reference during import.
    sys.modules[module_name] = module
    spec.loader.exec_module(module)
    return module
ROOT_DIR = Path(__file__).parent.parent
# cannot import envs directly because it depends on fastdeploy,
# which is not installed yet
envs = load_module_from_path('envs',
os.path.join(ROOT_DIR, 'fastdeploy', 'envs.py'))
archs = json.loads(envs.FD_BUILDING_ARCS)
use_bf16 = envs.FD_CPU_USE_BF16 == "True"
def download_and_extract(url, destination_directory):
@@ -80,6 +101,30 @@ def clone_git_repo(version, repo_url, destination_path):
return False
def process_git_repo(cur_path, dst_path, commit_id=None, patch=None):
    """
    Reset a git repo to a given commit and optionally apply a patch file.

    Args:
        cur_path: Directory containing the patch file (typically this
            setup script's directory).
        dst_path: Path of the git checkout to operate on.
        commit_id: Commit hash to ``git reset --hard`` to, or None to skip.
        patch: Patch file name (relative to ``cur_path``) to copy into the
            repo and apply with ``git apply``, or None to skip.

    Returns:
        True if all git commands succeeded, False otherwise.
    """
    if patch is not None:
        # The patch must live inside the repo so the relative file name
        # passed to `git apply` resolves against dst_path.
        shutil.copy(os.path.join(cur_path, patch),
                    os.path.join(dst_path, patch))
    try:
        # Use cwd= instead of os.chdir so the process working directory is
        # never mutated. The previous implementation chdir'd into dst_path
        # and only chdir'd back on success, leaving the build in the wrong
        # directory whenever a git command failed.
        if commit_id is not None:
            subprocess.run(["git", "reset", "--hard", commit_id],
                           cwd=dst_path, check=True)
        if patch is not None:
            subprocess.run(["git", "apply", patch],
                           cwd=dst_path, check=True)
        return True
    except subprocess.CalledProcessError:
        return False
def get_sm_version(archs):
"""
Get sm version of paddle.
@@ -94,6 +139,18 @@ def get_sm_version(archs):
return list(arch_set)
def get_nvcc_version():
    """
    Return the CUDA toolkit version reported by ``nvcc --version``.

    Returns:
        The release number (e.g. ``12.4``) as a float.

    Raises:
        subprocess.CalledProcessError: If ``nvcc`` exits with an error.
        ValueError: If the word "release" is absent from the output.
    """
    version_text = subprocess.check_output(["nvcc", "--version"],
                                           universal_newlines=True)
    tokens = version_text.split()
    # The token following "release" looks like "12.4," — drop the comma.
    release_token = tokens[tokens.index("release") + 1]
    return float(release_token.split(",")[0])
def get_gencode_flags(archs):
"""
Get gencode flags for current device or input.
@@ -149,7 +206,6 @@ if paddle.is_compiled_with_rocm():
"gpu_ops/save_with_output_msg.cc",
"gpu_ops/get_output.cc",
"gpu_ops/get_output_msg_with_topk.cc",
"gpu_ops/reset_need_stop_value.cc",
"gpu_ops/step.cu",
"gpu_ops/step_reschedule.cu",
"gpu_ops/set_data_ipc.cu",
@@ -175,25 +231,42 @@ if paddle.is_compiled_with_rocm():
)
elif paddle.is_compiled_with_cuda():
sources = [
"gpu_ops/set_mask_value.cu", "gpu_ops/set_value_by_flags.cu",
"gpu_ops/ngram_mask.cu", "gpu_ops/gather_idx.cu",
"gpu_ops/get_output_ep.cc", "gpu_ops/get_mm_split_fuse.cc",
"gpu_ops/set_mask_value.cu",
"gpu_ops/set_value_by_flags.cu",
"gpu_ops/ngram_mask.cu",
"gpu_ops/gather_idx.cu",
"gpu_ops/get_output_ep.cc",
"gpu_ops/get_mm_split_fuse.cc",
"gpu_ops/token_penalty_multi_scores.cu",
"gpu_ops/token_penalty_only_once.cu", "gpu_ops/stop_generation.cu",
"gpu_ops/token_penalty_only_once.cu",
"gpu_ops/stop_generation.cu",
"gpu_ops/stop_generation_multi_ends.cu",
"gpu_ops/stop_generation_multi_stop_seqs.cu", "gpu_ops/set_flags.cu",
"gpu_ops/step.cu", "gpu_ops/step_reschedule.cu",
"gpu_ops/fused_get_rope.cu", "gpu_ops/get_padding_offset.cu",
"gpu_ops/update_inputs.cu", "gpu_ops/update_inputs_beam.cu",
"gpu_ops/beam_search_softmax.cu", "gpu_ops/rebuild_padding.cu",
"gpu_ops/set_data_ipc.cu", "gpu_ops/read_data_ipc.cu",
"gpu_ops/enforce_generation.cu", "gpu_ops/dequant_int8.cu",
"gpu_ops/tune_cublaslt_gemm.cu", "gpu_ops/swap_cache_batch.cu",
"gpu_ops/swap_cache.cu", "gpu_ops/step_system_cache.cu",
"gpu_ops/cpp_extensions.cu", "gpu_ops/share_external_data.cu",
"gpu_ops/stop_generation_multi_stop_seqs.cu",
"gpu_ops/set_flags.cu",
"gpu_ops/step.cu",
"gpu_ops/step_reschedule.cu",
"gpu_ops/fused_get_rope.cu",
"gpu_ops/get_padding_offset.cu",
"gpu_ops/update_inputs.cu",
"gpu_ops/update_inputs_beam.cu",
"gpu_ops/beam_search_softmax.cu",
"gpu_ops/rebuild_padding.cu",
"gpu_ops/set_data_ipc.cu",
"gpu_ops/read_data_ipc.cu",
"gpu_ops/enforce_generation.cu",
"gpu_ops/dequant_int8.cu",
"gpu_ops/tune_cublaslt_gemm.cu",
"gpu_ops/swap_cache_batch.cu",
"gpu_ops/swap_cache.cu",
"gpu_ops/step_system_cache.cu",
"gpu_ops/cpp_extensions.cc",
"gpu_ops/share_external_data.cu",
"gpu_ops/per_token_quant_fp8.cu",
"gpu_ops/extract_text_token_output.cu",
"gpu_ops/update_split_fuse_input.cu"
"gpu_ops/update_split_fuse_input.cu",
"gpu_ops/text_image_index_out.cu",
"gpu_ops/text_image_gather_scatter.cu",
"gpu_ops/sample_kernels/rejection_top_p_sampling.cu",
]
# pd_disaggregation
@@ -201,6 +274,8 @@ elif paddle.is_compiled_with_cuda():
"gpu_ops/remote_cache_kv_ipc.cc",
"gpu_ops/open_shm_and_get_meta_signal.cc",
"gpu_ops/init_signal_layerwise.cc",
"gpu_ops/get_data_ptr_ipc.cu",
"gpu_ops/ipc_sent_key_value_cache_by_remote_ptr.cu",
]
cutlass_dir = "third_party/cutlass"
@@ -213,12 +288,26 @@ elif paddle.is_compiled_with_cuda():
raise ValueError("Git clone cutlass failed!")
# deep gemm
deep_gemm_dir = "third_party/DeepGEMM"
if not os.path.exists(deep_gemm_dir) or not os.listdir(deep_gemm_dir):
if not os.path.exists(deep_gemm_dir):
os.makedirs(deep_gemm_dir)
clone_git_repo("main", "https://github.com/deepseek-ai/DeepGEMM.git",
deep_gemm_dir)
if not os.listdir(deep_gemm_dir):
raise ValueError("Git clone DeepGEMM failed!")
cur_path = os.path.dirname(os.path.abspath(__file__))
dst_path = os.path.join(cur_path, deep_gemm_dir)
commit_id = "95e81b3dd6704e279e5f4757c5b94776ac988a8d"
patch = "0001-DeepGEMM-95e81b3.patch"
process_git_repo(cur_path, dst_path, commit_id, patch)
dg_third_party_include_dirs = (
"third_party/cutlass/include/cute",
"third_party/cutlass/include/cutlass",
)
dg_include_dir = "gpu_ops/fp8_deep_gemm/deep_gemm/include"
dg_include_dir = "third_party/DeepGEMM/deep_gemm/include"
os.makedirs(dg_include_dir, exist_ok=True)
for d in dg_third_party_include_dirs:
@@ -252,15 +341,37 @@ elif paddle.is_compiled_with_cuda():
nvcc_compile_args = get_gencode_flags(archs)
nvcc_compile_args += ["-DPADDLE_DEV"]
nvcc_compile_args += ["-DPADDLE_ON_INFERENCE"]
nvcc_compile_args += ["-DPy_LIMITED_API=0x03090000"]
nvcc_compile_args += [
"-Igpu_ops/cutlass_kernels",
"-Ithird_party/cutlass/include",
"-Ithird_party/cutlass/tools/util/include",
"-Igpu_ops/fp8_gemm_with_cutlass",
"-Igpu_ops",
"-Ithird_party/nlohmann_json/include",
]
nvcc_version = get_nvcc_version()
print(f'nvcc_version = {nvcc_version}')
if nvcc_version >= 12.0:
sources += ["gpu_ops/sample_kernels/air_top_p_sampling.cu"]
cc = max(get_sm_version(archs))
print(f"cc = {cc}")
fp8_auto_gen_directory = "gpu_ops/cutlass_kernels/fp8_gemm_fused/autogen"
if os.path.isdir(fp8_auto_gen_directory):
shutil.rmtree(fp8_auto_gen_directory)
if cc >= 75:
nvcc_compile_args += [
"-DENABLE_SCALED_MM_C2X=1",
"-Igpu_ops/cutlass_kernels/w8a8",
]
sources += [
"gpu_ops/cutlass_kernels/w8a8/scaled_mm_entry.cu",
"gpu_ops/cutlass_kernels/w8a8/scaled_mm_c2x.cu",
"gpu_ops/quantization/common.cu",
]
if cc >= 80:
# append_attention
sources += ["gpu_ops/append_attention.cu"]
@@ -280,19 +391,45 @@ elif paddle.is_compiled_with_cuda():
if cc >= 89:
# Running generate fp8 gemm codes.
nvcc_compile_args += ["-DENABLE_FP8"]
os.system("python auto_gen_fp8_fp8_gemm_fused_kernels.py")
os.system("python auto_gen_fp8_fp8_dual_gemm_fused_kernels.py")
os.system("python auto_gen_visitor_fp8_gemm_fused_kernels.py")
nvcc_compile_args += [
"-Igpu_ops/cutlass_kernels/fp8_gemm_fused/autogen"
]
os.system("python utils/auto_gen_visitor_fp8_gemm_fused_kernels.py")
if cc < 90:
os.system("python utils/auto_gen_fp8_fp8_gemm_fused_kernels.py")
os.system(
"python utils/auto_gen_fp8_fp8_dual_gemm_fused_kernels.py")
else:
nvcc_compile_args += [
"-gencode",
"arch=compute_90a,code=compute_90a",
"-O3",
"-DNDEBUG",
]
os.system(
"python utils/auto_gen_fp8_fp8_gemm_fused_kernels_sm90.py")
os.system(
"python utils/auto_gen_fp8_fp8_dual_gemm_fused_kernels_sm90.py"
)
os.system(
"python utils/auto_gen_fp8_fp8_block_gemm_fused_kernels_sm90.py"
)
nvcc_compile_args += [
"-DENABLE_SCALED_MM_SM90=1",
]
sources += [
"gpu_ops/fp8_gemm_with_cutlass/fp8_fp8_half_block_gemm.cu",
"gpu_ops/cutlass_kernels/w8a8/scaled_mm_c3x_sm90.cu",
"gpu_ops/cutlass_kernels/w8a8/c3x/scaled_mm_sm90_fp8.cu",
"gpu_ops/cutlass_kernels/w8a8/c3x/scaled_mm_sm90_int8.cu",
"gpu_ops/cutlass_kernels/w8a8/c3x/scaled_mm_azp_sm90_int8.cu",
]
sources += [
"gpu_ops/fp8_gemm_with_cutlass/fp8_fp8_half_gemm.cu",
"gpu_ops/cutlass_kernels/fp8_gemm_fused/fp8_fp8_gemm_scale_bias_act.cu",
"gpu_ops/fp8_gemm_with_cutlass/fp8_fp8_fp8_dual_gemm.cu",
"gpu_ops/cutlass_kernels/fp8_gemm_fused/fp8_fp8_dual_gemm_scale_bias_act.cu",
"gpu_ops/fp8_gemm_with_cutlass/fp8_fp8_half_cuda_core_gemm.cu",
"gpu_ops/fp8_gemm_with_cutlass/per_channel_fp8_fp8_half_gemm.cu",
"gpu_ops/cutlass_kernels/fp8_gemm_fused/visitor_fp8_gemm_fused.cu",
@@ -300,22 +437,9 @@ elif paddle.is_compiled_with_cuda():
"gpu_ops/scaled_gemm_f8_i4_f16_weight_quantize.cu",
"gpu_ops/cutlass_kernels/cutlass_heuristic.cu",
"gpu_ops/cutlass_kernels/cutlass_preprocessors.cu",
"gpu_ops/air_topp_sampling.cu",
]
if cc >= 90:
nvcc_compile_args += [
"-gencode",
"arch=compute_90a,code=compute_90a",
"-O3",
"-DNDEBUG",
]
os.system("python auto_gen_fp8_fp8_block_gemm_fused_kernels_sm90.py")
sources += ["gpu_ops/fp8_gemm_with_cutlass/fp8_fp8_half_block_gemm.cu"]
# for fp8 autogen *.cu
if cc >= 89:
sources += find_end_files(
"gpu_ops/cutlass_kernels/fp8_gemm_fused/autogen", ".cu")
sources += find_end_files(fp8_auto_gen_directory, ".cu")
setup(
name="fastdeploy_ops",
@@ -324,8 +448,8 @@ elif paddle.is_compiled_with_cuda():
extra_compile_args={"nvcc": nvcc_compile_args},
libraries=["cublasLt"],
),
packages=find_packages(where="gpu_ops/fp8_deep_gemm"),
package_dir={"": "gpu_ops/fp8_deep_gemm"},
packages=find_packages(where="third_party/DeepGEMM"),
package_dir={"": "third_party/DeepGEMM"},
package_data={
"deep_gemm": [
"include/deep_gemm/**/*",
@@ -336,56 +460,9 @@ elif paddle.is_compiled_with_cuda():
include_package_data=True,
)
elif paddle.is_compiled_with_xpu():
# TODO zhangsishuai@baidu.com to add xpu ops
setup(
name="fastdeploy_ops",
ext_modules=CUDAExtension(sources=[
"xpu_ops/set_mask_value.cu",
"xpu_ops/set_value_by_flags.cu",
"xpu_ops/ngram_mask.cu",
"xpu_ops/gather_idx.cu",
"xpu_ops/token_penalty_multi_scores.cu",
"xpu_ops/token_penalty_only_once.cu",
]),
)
assert False, "In XPU, we should use setup_ops.py in xpu_ops/src, not this."
else:
use_bf16 = os.getenv("CPU_USE_BF16", "False") == "True"
x86_simd_sort_dir = "third_party/x86-simd-sort"
if not os.path.exists(x86_simd_sort_dir) or not os.listdir(
x86_simd_sort_dir):
x86_simd_sort_url = "https://paddlepaddle-inference-banchmark.bj.bcebos.com/x86-simd-sort.tar.gz"
download_and_extract(x86_simd_sort_url, "third_party")
xft_dir = "third_party/xFasterTransformer"
if not os.path.exists(xft_dir) or not os.listdir(xft_dir):
if use_bf16:
xft_url = (
"https://paddlepaddle-inference-banchmark.bj.bcebos.com/xft.tar.gz"
)
else:
xft_url = "https://paddlepaddle-inference-banchmark.bj.bcebos.com/xft_no_bf16.tar.gz"
download_and_extract(xft_url, "third_party")
libs = [
"xfastertransformer",
"xft_comm_helper",
"x86simdsortcpp",
]
xft_dir = "third_party/xFasterTransformer"
x86_simd_sort_dir = "third_party/x86-simd-sort"
paddle_custom_kernel_include = [
os.path.join(xft_dir, "include"),
os.path.join(xft_dir, "src/common"), # src
os.path.join(xft_dir, "src/kernels"), # src
os.path.join(xft_dir, "src/layers"), # src
os.path.join(xft_dir, "src/models"), # src
os.path.join(xft_dir, "src/utils"), # src
os.path.join(xft_dir, "3rdparty/onednn/include"), # src
os.path.join(xft_dir, "3rdparty/onednn/build/include"), # src
os.path.join(xft_dir, "3rdparty/xdnn"), # src
os.path.join(xft_dir, "3rdparty"),
os.path.join(xft_dir, "3rdparty/mkl/include"),
os.path.join(x86_simd_sort_dir, "src"), # src
]
use_bf16 = envs.FD_CPU_USE_BF16 == "True"
# cc flags
paddle_extra_compile_args = [
@@ -394,46 +471,15 @@ else:
"-fPIC",
"-Wno-parentheses",
"-DPADDLE_WITH_CUSTOM_KERNEL",
"-mavx512f",
"-mavx512vl",
"-fopenmp",
"-mavx512bw",
"-mno-mmx",
"-DPADDLE_ON_INFERENCE",
"-Wall",
"-march=skylake-avx512",
"-O3",
"-g",
"-lstdc++fs",
"-D_GLIBCXX_USE_CXX11_ABI=1",
]
if use_bf16:
# avx512-bf16 flags
paddle_extra_compile_args += [
"-DAVX512_BF16_WEIGHT_ONLY_BF16=true",
"-DAVX512_FP16_WEIGHT_ONLY_INT8=true",
"-DAVX512_FP16_WEIGHT_ONLY_FP16=true",
]
else:
# no avx512-bf16 flags
paddle_extra_compile_args += [
"-DAVX512_FP32_WEIGHT_ONLY_INT8=true",
"-DAVX512_FP32_WEIGHT_ONLY_FP16=true",
]
paddle_custom_kernel_library_dir = [
"third_party/xFasterTransformer/build/",
"third_party/x86-simd-sort/builddir",
"-DPy_LIMITED_API=0x03090000",
]
include_files = []
for include_dir in paddle_custom_kernel_include:
include_files.extend(glob.glob(os.path.join(include_dir, "*.h")))
so_files = []
for library_dir in paddle_custom_kernel_library_dir:
if os.path.isdir(library_dir):
for lib in libs:
lib_file = os.path.join(library_dir, f"lib{lib}.so")
if os.path.isfile(lib_file):
so_files.append(lib_file)
setup(
name="fastdeploy_cpu_ops",
ext_modules=CppExtension(
@@ -444,21 +490,14 @@ else:
"cpu_ops/stop_generation_multi_ends.cc",
"cpu_ops/update_inputs.cc",
"cpu_ops/get_padding_offset.cc",
"cpu_ops/xft_all_layer.cc",
"cpu_ops/xft_greedy_search.cc",
"cpu_ops/avx_weight_only.cc",
],
extra_link_args=[
"-Wl,-rpath,$ORIGIN/x86-simd-sort/builddir",
"-Wl,-rpath,$ORIGIN/xFasterTransformer/build",
],
include_dirs=paddle_custom_kernel_include,
library_dirs=paddle_custom_kernel_library_dir,
libraries=libs,
extra_compile_args=paddle_extra_compile_args,
),
packages=find_namespace_packages(where="third_party"),
package_dir={"": "third_party"},
package_data={"fastdeploy_cpu_ops": include_files + so_files},
include_package_data=True,
)