[LLM] First commit the llm deployment code

This commit is contained in:
jiangjiajun
2025-06-09 19:20:15 +08:00
parent 980c0a1d2c
commit 684703fd72
11814 changed files with 127294 additions and 1293102 deletions

170
custom_ops/setup_ops_cpu.py Normal file
View File

@@ -0,0 +1,170 @@
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" setup for FASTDEPLOY custom cpu ops """
import os
import subprocess
from paddle.utils.cpp_extension import setup, CppExtension
from setuptools import find_namespace_packages
import glob
import tarfile
# Placeholder list of target architectures; it is left empty here.
BUILDING_ARCS = []

# BF16 support is opt-in: only the exact string "True" in the CPU_USE_BF16
# environment variable enables it; any other value (or no value) disables it.
use_bf16 = os.environ.get("CPU_USE_BF16", "False") == "True"
def download_and_extract(url, destination_directory):
    """
    Download a .tar.gz file using wget to the destination directory
    and extract its contents without renaming the downloaded file.

    The archive is stored under the basename of ``url``, unpacked into
    ``destination_directory`` and then deleted. The downloaded file is
    removed on failure as well, so a partial or corrupt archive is never
    left behind. Errors are reported on stdout rather than raised; the
    function always returns ``None``.

    :param url: The URL of the .tar.gz file to download.
    :param destination_directory: The directory where the file should be
        downloaded and extracted. It is created if it does not exist.
    """
    os.makedirs(destination_directory, exist_ok=True)
    file_path = os.path.join(destination_directory, os.path.basename(url))

    try:
        try:
            subprocess.run(["wget", "-O", file_path, url], check=True)
        except (subprocess.CalledProcessError, OSError) as e:
            # OSError covers a missing or non-executable wget binary, which
            # is a download problem and must not be reported as an
            # extraction error.
            print(f"Error downloading file: {e}")
            return
        print(f"Downloaded: {file_path}")

        try:
            # NOTE(review): extractall() trusts the member paths stored in
            # the archive. Only use this with archives from a trusted
            # source, or add an extraction filter once the minimum
            # supported Python version provides one.
            with tarfile.open(file_path, "r:gz") as tar:
                tar.extractall(path=destination_directory)
        except Exception as e:
            # Deliberately broad: this is a best-effort step in a build
            # script and any failure here is reported, not raised.
            print(f"Error extracting file: {e}")
            return
        print(f"Extracted: {file_path} to {destination_directory}")
    finally:
        # Clean up on every path; "wget -O" can leave an empty or partial
        # file behind even when the download itself failed.
        if os.path.isfile(file_path):
            try:
                os.remove(file_path)
                print(f"Deleted downloaded file: {file_path}")
            except OSError as e:
                print(f"Error deleting downloaded file: {e}")
# Fetch x86-simd-sort unless a non-empty checkout is already present.
x86_simd_sort_dir = "third_party/x86-simd-sort"
if not (os.path.exists(x86_simd_sort_dir) and os.listdir(x86_simd_sort_dir)):
    x86_simd_sort_url = (
        "https://paddlepaddle-inference-banchmark.bj.bcebos.com/x86-simd-sort.tar.gz"
    )
    download_and_extract(x86_simd_sort_url, "third_party")

# Fetch xFasterTransformer the same way; which archive is downloaded
# depends on whether BF16 support was requested.
xft_dir = "third_party/xFasterTransformer"
if not (os.path.exists(xft_dir) and os.listdir(xft_dir)):
    xft_url = (
        "https://paddlepaddle-inference-banchmark.bj.bcebos.com/xft.tar.gz"
        if use_bf16
        else "https://paddlepaddle-inference-banchmark.bj.bcebos.com/xft_no_bf16.tar.gz"
    )
    download_and_extract(xft_url, "third_party")
# Names of the shared libraries the extension links against.
libs = ["xfastertransformer", "xft_comm_helper", "x86simdsortcpp"]

# Root directories of the third-party source trees.
xft_dir = "third_party/xFasterTransformer"
x86_simd_sort_dir = "third_party/x86-simd-sort"

# Header search paths: the xFasterTransformer directories first, followed by
# the x86-simd-sort sources.
paddle_custom_kernel_include = [
    os.path.join(xft_dir, subdir)
    for subdir in (
        "include",
        "src/common",
        "src/kernels",
        "src/layers",
        "src/models",
        "src/utils",
        "3rdparty/onednn/include",
        "3rdparty/onednn/build/include",
        "3rdparty/xdnn",
        "3rdparty",
        "3rdparty/mkl/include",
    )
]
paddle_custom_kernel_include.append(os.path.join(x86_simd_sort_dir, "src"))
# cc flags shared by every build configuration
paddle_extra_compile_args = [
    "-std=c++17",
    "-shared",
    "-fPIC",
    "-Wno-parentheses",
    "-DPADDLE_WITH_CUSTOM_KERNEL",
    "-mavx512f",
    "-mavx512vl",
    "-fopenmp",
    "-mavx512bw",
    "-mno-mmx",
    "-Wall",
    "-march=skylake-avx512",
    "-O3",
    "-g",
    "-lstdc++fs",
    "-D_GLIBCXX_USE_CXX11_ABI=1",
]

# Append the weight-only kernel defines: the first list is the avx512-bf16
# set, the second is the set used when avx512-bf16 is not requested.
paddle_extra_compile_args.extend(
    [
        "-DAVX512_BF16_WEIGHT_ONLY_BF16=true",
        "-DAVX512_FP16_WEIGHT_ONLY_INT8=true",
        "-DAVX512_FP16_WEIGHT_ONLY_FP16=true",
    ]
    if use_bf16
    else [
        "-DAVX512_FP32_WEIGHT_ONLY_INT8=true",
        "-DAVX512_FP32_WEIGHT_ONLY_FP16=true",
    ]
)
# Directories searched for the prebuilt third-party shared libraries.
paddle_custom_kernel_library_dir = [
    "third_party/xFasterTransformer/build/",
    "third_party/x86-simd-sort/builddir",
]

# Every *.h file located directly inside one of the include directories.
include_files = [
    header
    for include_dir in paddle_custom_kernel_include
    for header in glob.glob(os.path.join(include_dir, "*.h"))
]

# The lib<name>.so files that actually exist in the library directories.
so_files = []
for library_dir in paddle_custom_kernel_library_dir:
    if not os.path.isdir(library_dir):
        continue
    candidates = (os.path.join(library_dir, f"lib{lib}.so") for lib in libs)
    so_files.extend(path for path in candidates if os.path.isfile(path))
# C++ sources of the custom CPU operators compiled into the extension.
cpu_op_sources = [
    "cpu_ops/simd_sort.cc",
    "cpu_ops/set_value_by_flags.cc",
    "cpu_ops/token_penalty_multi_scores.cc",
    "cpu_ops/stop_generation_multi_ends.cc",
    "cpu_ops/update_inputs.cc",
    "cpu_ops/get_padding_offset.cc",
    "cpu_ops/xft_all_layer.cc",
    "cpu_ops/xft_greedy_search.cc",
    "cpu_ops/avx_weight_only.cc",
]

# Runtime search paths for the third-party shared libraries, expressed
# relative to the location of the built extension ($ORIGIN).
rpath_link_args = [
    "-Wl,-rpath,$ORIGIN/x86-simd-sort/builddir",
    "-Wl,-rpath,$ORIGIN/xFasterTransformer/build",
]

cpu_ops_extension = CppExtension(
    sources=cpu_op_sources,
    extra_link_args=rpath_link_args,
    include_dirs=paddle_custom_kernel_include,
    library_dirs=paddle_custom_kernel_library_dir,
    libraries=libs,
    extra_compile_args=paddle_extra_compile_args,
)

# Build the extension and package the collected headers and shared
# libraries as package data of "fastdeploy_cpu_ops".
setup(
    name="fastdeploy_cpu_ops",
    ext_modules=cpu_ops_extension,
    packages=find_namespace_packages(where="third_party"),
    package_dir={"": "third_party"},
    package_data={"fastdeploy_cpu_ops": include_files + so_files},
    include_package_data=True,
)