Sync v2.0 version of code to github repo

This commit is contained in:
Jiang-Jia-Jun
2025-06-29 23:29:37 +00:00
parent d151496038
commit 92c2cfa2e7
597 changed files with 78776 additions and 22905 deletions

View File

@@ -1,4 +1,4 @@
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -12,19 +12,40 @@
# See the License for the specific language governing permissions and
# limitations under the License.
""" setup for FastDeploy custom ops """
import glob
import importlib
import json
import os
import shutil
import subprocess
import sys
import tarfile
from pathlib import Path
import paddle
from paddle.utils.cpp_extension import CppExtension, CUDAExtension, setup
from setuptools import find_namespace_packages, find_packages
archs = json.loads(os.getenv("BUILDING_ARCS", "[]"))
use_bf16 = os.getenv("CPU_USE_BF16", "False") == "True"
def load_module_from_path(module_name, path):
    """
    Load a Python module directly from a file path.

    Args:
        module_name: Name to register the module under in ``sys.modules``.
        path: Filesystem path of the ``.py`` file to execute.

    Returns:
        The loaded module object.

    Raises:
        ImportError: If no import spec can be created for ``path``
            (e.g. the file does not exist or is not importable).
    """
    spec = importlib.util.spec_from_file_location(module_name, path)
    # spec is None for a missing/unloadable file; fail with a clear error
    # instead of an opaque AttributeError on spec.loader below.
    if spec is None or spec.loader is None:
        raise ImportError(
            f"Cannot load module '{module_name}' from path '{path}'")
    module = importlib.util.module_from_spec(spec)
    # Register before exec so the module can self-reference during import.
    sys.modules[module_name] = module
    spec.loader.exec_module(module)
    return module
ROOT_DIR = Path(__file__).parent.parent
# cannot import envs directly because it depends on fastdeploy,
# which is not installed yet
envs = load_module_from_path('envs',
os.path.join(ROOT_DIR, 'fastdeploy', 'envs.py'))
archs = json.loads(envs.FD_BUILDING_ARCS)
use_bf16 = envs.FD_CPU_USE_BF16 == "True"
def download_and_extract(url, destination_directory):
@@ -80,6 +101,30 @@ def clone_git_repo(version, repo_url, destination_path):
return False
def process_git_repo(cur_path, dst_path, commit_id=None, patch=None):
    """
    Reset a git repo to a given commit and optionally apply a patch file.

    Args:
        cur_path: Directory containing the patch file (typically this
            setup script's directory).
        dst_path: Path of the git checkout to operate on.
        commit_id: Commit hash to ``git reset --hard`` to, or None to skip.
        patch: Patch file name (relative to ``cur_path``) to copy into the
            repo and apply with ``git apply``, or None to skip.

    Returns:
        True if all git commands succeeded, False otherwise.
    """
    if patch is not None:
        # The patch must live inside the repo so the relative file name
        # passed to `git apply` resolves against dst_path.
        shutil.copy(os.path.join(cur_path, patch),
                    os.path.join(dst_path, patch))
    try:
        # Use cwd= instead of os.chdir so the process working directory is
        # never mutated. The previous implementation chdir'd into dst_path
        # and only chdir'd back on success, leaving the build in the wrong
        # directory whenever a git command failed.
        if commit_id is not None:
            subprocess.run(["git", "reset", "--hard", commit_id],
                           cwd=dst_path, check=True)
        if patch is not None:
            subprocess.run(["git", "apply", patch],
                           cwd=dst_path, check=True)
        return True
    except subprocess.CalledProcessError:
        return False
def get_sm_version(archs):
"""
Get sm version of paddle.
@@ -94,6 +139,18 @@ def get_sm_version(archs):
return list(arch_set)
def get_nvcc_version():
    """
    Return the CUDA toolkit version reported by ``nvcc --version``.

    Returns:
        The release number (e.g. ``12.4``) as a float.

    Raises:
        subprocess.CalledProcessError: If ``nvcc`` exits with an error.
        ValueError: If the word "release" is absent from the output.
    """
    version_text = subprocess.check_output(["nvcc", "--version"],
                                           universal_newlines=True)
    tokens = version_text.split()
    # The token following "release" looks like "12.4," — drop the comma.
    release_token = tokens[tokens.index("release") + 1]
    return float(release_token.split(",")[0])
def get_gencode_flags(archs):
"""
Get gencode flags for current device or input.
@@ -149,7 +206,6 @@ if paddle.is_compiled_with_rocm():
"gpu_ops/save_with_output_msg.cc",
"gpu_ops/get_output.cc",
"gpu_ops/get_output_msg_with_topk.cc",
"gpu_ops/reset_need_stop_value.cc",
"gpu_ops/step.cu",
"gpu_ops/step_reschedule.cu",
"gpu_ops/set_data_ipc.cu",
@@ -175,25 +231,42 @@ if paddle.is_compiled_with_rocm():
)
elif paddle.is_compiled_with_cuda():
sources = [
"gpu_ops/set_mask_value.cu", "gpu_ops/set_value_by_flags.cu",
"gpu_ops/ngram_mask.cu", "gpu_ops/gather_idx.cu",
"gpu_ops/get_output_ep.cc", "gpu_ops/get_mm_split_fuse.cc",
"gpu_ops/set_mask_value.cu",
"gpu_ops/set_value_by_flags.cu",
"gpu_ops/ngram_mask.cu",
"gpu_ops/gather_idx.cu",
"gpu_ops/get_output_ep.cc",
"gpu_ops/get_mm_split_fuse.cc",
"gpu_ops/token_penalty_multi_scores.cu",
"gpu_ops/token_penalty_only_once.cu", "gpu_ops/stop_generation.cu",
"gpu_ops/token_penalty_only_once.cu",
"gpu_ops/stop_generation.cu",
"gpu_ops/stop_generation_multi_ends.cu",
"gpu_ops/stop_generation_multi_stop_seqs.cu", "gpu_ops/set_flags.cu",
"gpu_ops/step.cu", "gpu_ops/step_reschedule.cu",
"gpu_ops/fused_get_rope.cu", "gpu_ops/get_padding_offset.cu",
"gpu_ops/update_inputs.cu", "gpu_ops/update_inputs_beam.cu",
"gpu_ops/beam_search_softmax.cu", "gpu_ops/rebuild_padding.cu",
"gpu_ops/set_data_ipc.cu", "gpu_ops/read_data_ipc.cu",
"gpu_ops/enforce_generation.cu", "gpu_ops/dequant_int8.cu",
"gpu_ops/tune_cublaslt_gemm.cu", "gpu_ops/swap_cache_batch.cu",
"gpu_ops/swap_cache.cu", "gpu_ops/step_system_cache.cu",
"gpu_ops/cpp_extensions.cu", "gpu_ops/share_external_data.cu",
"gpu_ops/stop_generation_multi_stop_seqs.cu",
"gpu_ops/set_flags.cu",
"gpu_ops/step.cu",
"gpu_ops/step_reschedule.cu",
"gpu_ops/fused_get_rope.cu",
"gpu_ops/get_padding_offset.cu",
"gpu_ops/update_inputs.cu",
"gpu_ops/update_inputs_beam.cu",
"gpu_ops/beam_search_softmax.cu",
"gpu_ops/rebuild_padding.cu",
"gpu_ops/set_data_ipc.cu",
"gpu_ops/read_data_ipc.cu",
"gpu_ops/enforce_generation.cu",
"gpu_ops/dequant_int8.cu",
"gpu_ops/tune_cublaslt_gemm.cu",
"gpu_ops/swap_cache_batch.cu",
"gpu_ops/swap_cache.cu",
"gpu_ops/step_system_cache.cu",
"gpu_ops/cpp_extensions.cc",
"gpu_ops/share_external_data.cu",
"gpu_ops/per_token_quant_fp8.cu",
"gpu_ops/extract_text_token_output.cu",
"gpu_ops/update_split_fuse_input.cu"
"gpu_ops/update_split_fuse_input.cu",
"gpu_ops/text_image_index_out.cu",
"gpu_ops/text_image_gather_scatter.cu",
"gpu_ops/sample_kernels/rejection_top_p_sampling.cu",
]
# pd_disaggregation
@@ -201,6 +274,8 @@ elif paddle.is_compiled_with_cuda():
"gpu_ops/remote_cache_kv_ipc.cc",
"gpu_ops/open_shm_and_get_meta_signal.cc",
"gpu_ops/init_signal_layerwise.cc",
"gpu_ops/get_data_ptr_ipc.cu",
"gpu_ops/ipc_sent_key_value_cache_by_remote_ptr.cu",
]
cutlass_dir = "third_party/cutlass"
@@ -213,12 +288,26 @@ elif paddle.is_compiled_with_cuda():
raise ValueError("Git clone cutlass failed!")
# deep gemm
deep_gemm_dir = "third_party/DeepGEMM"
if not os.path.exists(deep_gemm_dir) or not os.listdir(deep_gemm_dir):
if not os.path.exists(deep_gemm_dir):
os.makedirs(deep_gemm_dir)
clone_git_repo("main", "https://github.com/deepseek-ai/DeepGEMM.git",
deep_gemm_dir)
if not os.listdir(deep_gemm_dir):
raise ValueError("Git clone DeepGEMM failed!")
cur_path = os.path.dirname(os.path.abspath(__file__))
dst_path = os.path.join(cur_path, deep_gemm_dir)
commit_id = "95e81b3dd6704e279e5f4757c5b94776ac988a8d"
patch = "0001-DeepGEMM-95e81b3.patch"
process_git_repo(cur_path, dst_path, commit_id, patch)
dg_third_party_include_dirs = (
"third_party/cutlass/include/cute",
"third_party/cutlass/include/cutlass",
)
dg_include_dir = "gpu_ops/fp8_deep_gemm/deep_gemm/include"
dg_include_dir = "third_party/DeepGEMM/deep_gemm/include"
os.makedirs(dg_include_dir, exist_ok=True)
for d in dg_third_party_include_dirs:
@@ -252,15 +341,37 @@ elif paddle.is_compiled_with_cuda():
nvcc_compile_args = get_gencode_flags(archs)
nvcc_compile_args += ["-DPADDLE_DEV"]
nvcc_compile_args += ["-DPADDLE_ON_INFERENCE"]
nvcc_compile_args += ["-DPy_LIMITED_API=0x03090000"]
nvcc_compile_args += [
"-Igpu_ops/cutlass_kernels",
"-Ithird_party/cutlass/include",
"-Ithird_party/cutlass/tools/util/include",
"-Igpu_ops/fp8_gemm_with_cutlass",
"-Igpu_ops",
"-Ithird_party/nlohmann_json/include",
]
nvcc_version = get_nvcc_version()
print(f'nvcc_version = {nvcc_version}')
if nvcc_version >= 12.0:
sources += ["gpu_ops/sample_kernels/air_top_p_sampling.cu"]
cc = max(get_sm_version(archs))
print(f"cc = {cc}")
fp8_auto_gen_directory = "gpu_ops/cutlass_kernels/fp8_gemm_fused/autogen"
if os.path.isdir(fp8_auto_gen_directory):
shutil.rmtree(fp8_auto_gen_directory)
if cc >= 75:
nvcc_compile_args += [
"-DENABLE_SCALED_MM_C2X=1",
"-Igpu_ops/cutlass_kernels/w8a8",
]
sources += [
"gpu_ops/cutlass_kernels/w8a8/scaled_mm_entry.cu",
"gpu_ops/cutlass_kernels/w8a8/scaled_mm_c2x.cu",
"gpu_ops/quantization/common.cu",
]
if cc >= 80:
# append_attention
sources += ["gpu_ops/append_attention.cu"]
@@ -280,19 +391,45 @@ elif paddle.is_compiled_with_cuda():
if cc >= 89:
# Running generate fp8 gemm codes.
nvcc_compile_args += ["-DENABLE_FP8"]
os.system("python auto_gen_fp8_fp8_gemm_fused_kernels.py")
os.system("python auto_gen_fp8_fp8_dual_gemm_fused_kernels.py")
os.system("python auto_gen_visitor_fp8_gemm_fused_kernels.py")
nvcc_compile_args += [
"-Igpu_ops/cutlass_kernels/fp8_gemm_fused/autogen"
]
os.system("python utils/auto_gen_visitor_fp8_gemm_fused_kernels.py")
if cc < 90:
os.system("python utils/auto_gen_fp8_fp8_gemm_fused_kernels.py")
os.system(
"python utils/auto_gen_fp8_fp8_dual_gemm_fused_kernels.py")
else:
nvcc_compile_args += [
"-gencode",
"arch=compute_90a,code=compute_90a",
"-O3",
"-DNDEBUG",
]
os.system(
"python utils/auto_gen_fp8_fp8_gemm_fused_kernels_sm90.py")
os.system(
"python utils/auto_gen_fp8_fp8_dual_gemm_fused_kernels_sm90.py"
)
os.system(
"python utils/auto_gen_fp8_fp8_block_gemm_fused_kernels_sm90.py"
)
nvcc_compile_args += [
"-DENABLE_SCALED_MM_SM90=1",
]
sources += [
"gpu_ops/fp8_gemm_with_cutlass/fp8_fp8_half_block_gemm.cu",
"gpu_ops/cutlass_kernels/w8a8/scaled_mm_c3x_sm90.cu",
"gpu_ops/cutlass_kernels/w8a8/c3x/scaled_mm_sm90_fp8.cu",
"gpu_ops/cutlass_kernels/w8a8/c3x/scaled_mm_sm90_int8.cu",
"gpu_ops/cutlass_kernels/w8a8/c3x/scaled_mm_azp_sm90_int8.cu",
]
sources += [
"gpu_ops/fp8_gemm_with_cutlass/fp8_fp8_half_gemm.cu",
"gpu_ops/cutlass_kernels/fp8_gemm_fused/fp8_fp8_gemm_scale_bias_act.cu",
"gpu_ops/fp8_gemm_with_cutlass/fp8_fp8_fp8_dual_gemm.cu",
"gpu_ops/cutlass_kernels/fp8_gemm_fused/fp8_fp8_dual_gemm_scale_bias_act.cu",
"gpu_ops/fp8_gemm_with_cutlass/fp8_fp8_half_cuda_core_gemm.cu",
"gpu_ops/fp8_gemm_with_cutlass/per_channel_fp8_fp8_half_gemm.cu",
"gpu_ops/cutlass_kernels/fp8_gemm_fused/visitor_fp8_gemm_fused.cu",
@@ -300,22 +437,9 @@ elif paddle.is_compiled_with_cuda():
"gpu_ops/scaled_gemm_f8_i4_f16_weight_quantize.cu",
"gpu_ops/cutlass_kernels/cutlass_heuristic.cu",
"gpu_ops/cutlass_kernels/cutlass_preprocessors.cu",
"gpu_ops/air_topp_sampling.cu",
]
if cc >= 90:
nvcc_compile_args += [
"-gencode",
"arch=compute_90a,code=compute_90a",
"-O3",
"-DNDEBUG",
]
os.system("python auto_gen_fp8_fp8_block_gemm_fused_kernels_sm90.py")
sources += ["gpu_ops/fp8_gemm_with_cutlass/fp8_fp8_half_block_gemm.cu"]
# for fp8 autogen *.cu
if cc >= 89:
sources += find_end_files(
"gpu_ops/cutlass_kernels/fp8_gemm_fused/autogen", ".cu")
sources += find_end_files(fp8_auto_gen_directory, ".cu")
setup(
name="fastdeploy_ops",
@@ -324,8 +448,8 @@ elif paddle.is_compiled_with_cuda():
extra_compile_args={"nvcc": nvcc_compile_args},
libraries=["cublasLt"],
),
packages=find_packages(where="gpu_ops/fp8_deep_gemm"),
package_dir={"": "gpu_ops/fp8_deep_gemm"},
packages=find_packages(where="third_party/DeepGEMM"),
package_dir={"": "third_party/DeepGEMM"},
package_data={
"deep_gemm": [
"include/deep_gemm/**/*",
@@ -336,56 +460,9 @@ elif paddle.is_compiled_with_cuda():
include_package_data=True,
)
elif paddle.is_compiled_with_xpu():
# TODO zhangsishuai@baidu.com to add xpu ops
setup(
name="fastdeploy_ops",
ext_modules=CUDAExtension(sources=[
"xpu_ops/set_mask_value.cu",
"xpu_ops/set_value_by_flags.cu",
"xpu_ops/ngram_mask.cu",
"xpu_ops/gather_idx.cu",
"xpu_ops/token_penalty_multi_scores.cu",
"xpu_ops/token_penalty_only_once.cu",
]),
)
assert False, "In XPU, we should use setup_ops.py in xpu_ops/src, not this."
else:
use_bf16 = os.getenv("CPU_USE_BF16", "False") == "True"
x86_simd_sort_dir = "third_party/x86-simd-sort"
if not os.path.exists(x86_simd_sort_dir) or not os.listdir(
x86_simd_sort_dir):
x86_simd_sort_url = "https://paddlepaddle-inference-banchmark.bj.bcebos.com/x86-simd-sort.tar.gz"
download_and_extract(x86_simd_sort_url, "third_party")
xft_dir = "third_party/xFasterTransformer"
if not os.path.exists(xft_dir) or not os.listdir(xft_dir):
if use_bf16:
xft_url = (
"https://paddlepaddle-inference-banchmark.bj.bcebos.com/xft.tar.gz"
)
else:
xft_url = "https://paddlepaddle-inference-banchmark.bj.bcebos.com/xft_no_bf16.tar.gz"
download_and_extract(xft_url, "third_party")
libs = [
"xfastertransformer",
"xft_comm_helper",
"x86simdsortcpp",
]
xft_dir = "third_party/xFasterTransformer"
x86_simd_sort_dir = "third_party/x86-simd-sort"
paddle_custom_kernel_include = [
os.path.join(xft_dir, "include"),
os.path.join(xft_dir, "src/common"), # src
os.path.join(xft_dir, "src/kernels"), # src
os.path.join(xft_dir, "src/layers"), # src
os.path.join(xft_dir, "src/models"), # src
os.path.join(xft_dir, "src/utils"), # src
os.path.join(xft_dir, "3rdparty/onednn/include"), # src
os.path.join(xft_dir, "3rdparty/onednn/build/include"), # src
os.path.join(xft_dir, "3rdparty/xdnn"), # src
os.path.join(xft_dir, "3rdparty"),
os.path.join(xft_dir, "3rdparty/mkl/include"),
os.path.join(x86_simd_sort_dir, "src"), # src
]
use_bf16 = envs.FD_CPU_USE_BF16 == "True"
# cc flags
paddle_extra_compile_args = [
@@ -394,46 +471,15 @@ else:
"-fPIC",
"-Wno-parentheses",
"-DPADDLE_WITH_CUSTOM_KERNEL",
"-mavx512f",
"-mavx512vl",
"-fopenmp",
"-mavx512bw",
"-mno-mmx",
"-DPADDLE_ON_INFERENCE",
"-Wall",
"-march=skylake-avx512",
"-O3",
"-g",
"-lstdc++fs",
"-D_GLIBCXX_USE_CXX11_ABI=1",
]
if use_bf16:
# avx512-bf16 flags
paddle_extra_compile_args += [
"-DAVX512_BF16_WEIGHT_ONLY_BF16=true",
"-DAVX512_FP16_WEIGHT_ONLY_INT8=true",
"-DAVX512_FP16_WEIGHT_ONLY_FP16=true",
]
else:
# no avx512-bf16 flags
paddle_extra_compile_args += [
"-DAVX512_FP32_WEIGHT_ONLY_INT8=true",
"-DAVX512_FP32_WEIGHT_ONLY_FP16=true",
]
paddle_custom_kernel_library_dir = [
"third_party/xFasterTransformer/build/",
"third_party/x86-simd-sort/builddir",
"-DPy_LIMITED_API=0x03090000",
]
include_files = []
for include_dir in paddle_custom_kernel_include:
include_files.extend(glob.glob(os.path.join(include_dir, "*.h")))
so_files = []
for library_dir in paddle_custom_kernel_library_dir:
if os.path.isdir(library_dir):
for lib in libs:
lib_file = os.path.join(library_dir, f"lib{lib}.so")
if os.path.isfile(lib_file):
so_files.append(lib_file)
setup(
name="fastdeploy_cpu_ops",
ext_modules=CppExtension(
@@ -444,21 +490,14 @@ else:
"cpu_ops/stop_generation_multi_ends.cc",
"cpu_ops/update_inputs.cc",
"cpu_ops/get_padding_offset.cc",
"cpu_ops/xft_all_layer.cc",
"cpu_ops/xft_greedy_search.cc",
"cpu_ops/avx_weight_only.cc",
],
extra_link_args=[
"-Wl,-rpath,$ORIGIN/x86-simd-sort/builddir",
"-Wl,-rpath,$ORIGIN/xFasterTransformer/build",
],
include_dirs=paddle_custom_kernel_include,
library_dirs=paddle_custom_kernel_library_dir,
libraries=libs,
extra_compile_args=paddle_extra_compile_args,
),
packages=find_namespace_packages(where="third_party"),
package_dir={"": "third_party"},
package_data={"fastdeploy_cpu_ops": include_files + so_files},
include_package_data=True,
)