mirror of
https://github.com/PaddlePaddle/FastDeploy.git
synced 2025-09-27 04:46:16 +08:00
[FIX]Fix Machete compile via ENABLE_MACHETE (#3727)
* add ENABLE_MACHETE * fix * revert * update * pre_commit * fix * fix --------- Co-authored-by: Ayakouji <yuhongh@qq.com> Co-authored-by: Jiang-Jia-Jun <163579578+Jiang-Jia-Jun@users.noreply.github.com> Co-authored-by: aquagull <hongyuh@qq.com>
This commit is contained in:
@@ -986,6 +986,7 @@ PYBIND11_MODULE(fastdeploy_ops, m) {
|
||||
py::arg("recv_expert_count"), py::arg("block_size"),
|
||||
"per token per block quant");
|
||||
|
||||
#ifdef ENABLE_MACHETE
|
||||
/*machete/machete_mm.cu
|
||||
* machete_mm
|
||||
*/
|
||||
@@ -1004,6 +1005,7 @@ PYBIND11_MODULE(fastdeploy_ops, m) {
|
||||
* machete_supported_schedules
|
||||
*/
|
||||
m.def("machete_supported_schedules", &MacheteSupportedSchedules, "machete supported schedules function");
|
||||
#endif
|
||||
|
||||
/**
|
||||
* moe/fused_moe/moe_topk_select.cu
|
||||
|
@@ -373,6 +373,7 @@ elif paddle.is_compiled_with_cuda():
|
||||
if not os.listdir(json_dir):
|
||||
raise ValueError("Git clone nlohmann_json failed!")
|
||||
|
||||
cc_compile_args = []
|
||||
nvcc_compile_args = get_gencode_flags(archs)
|
||||
nvcc_compile_args += ["-DPADDLE_DEV"]
|
||||
nvcc_compile_args += ["-DPADDLE_ON_INFERENCE"]
|
||||
@@ -519,12 +520,13 @@ elif paddle.is_compiled_with_cuda():
|
||||
sources += find_end_files("gpu_ops/wfp8afp8_sparse_gemm", ".cu")
|
||||
os.system("python gpu_ops/machete/generate.py")
|
||||
sources += find_end_files("gpu_ops/machete", ".cu")
|
||||
cc_compile_args += ["-DENABLE_MACHETE"]
|
||||
|
||||
setup(
|
||||
name="fastdeploy_ops",
|
||||
ext_modules=CUDAExtension(
|
||||
sources=sources,
|
||||
extra_compile_args={"nvcc": nvcc_compile_args},
|
||||
extra_compile_args={"cxx": cc_compile_args, "nvcc": nvcc_compile_args},
|
||||
libraries=["cublasLt"],
|
||||
extra_link_args=["-lcuda"],
|
||||
),
|
||||
|
@@ -26,8 +26,14 @@ def get_sm_version():
|
||||
return cc
|
||||
|
||||
|
||||
_ENABLE_MACHETE = False
|
||||
if current_platform.is_cuda() and get_sm_version() == 90:
|
||||
from fastdeploy.model_executor.ops.gpu import machete_mm, machete_prepack_B
|
||||
try:
|
||||
from fastdeploy.model_executor.ops.gpu import machete_mm, machete_prepack_B
|
||||
|
||||
_ENABLE_MACHETE = True
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
|
||||
def get_pack_factor(num_bits):
|
||||
|
@@ -34,12 +34,6 @@ from ..utils import get_tensor
|
||||
from .quant_base import QuantConfigBase, QuantMethodBase
|
||||
|
||||
|
||||
def get_sm_version():
|
||||
prop = paddle.device.cuda.get_device_properties()
|
||||
cc = prop.major * 10 + prop.minor
|
||||
return cc
|
||||
|
||||
|
||||
class WeightOnlyConfig(QuantConfigBase):
|
||||
"""
|
||||
Quantization config for weight only
|
||||
@@ -139,10 +133,14 @@ class WeightOnlyConfig(QuantConfigBase):
|
||||
else:
|
||||
raise ValueError(f"Unsupported MOE backend {layer.use_method}")
|
||||
else:
|
||||
from fastdeploy.model_executor.layers.quantization.ops.machete_mm import (
|
||||
_ENABLE_MACHETE,
|
||||
)
|
||||
|
||||
if (
|
||||
self.name() == "wint4"
|
||||
and _ENABLE_MACHETE
|
||||
and envs.FD_USE_MACHETE == "1"
|
||||
and get_sm_version() == 90
|
||||
and layer.weight_shape[1]
|
||||
and layer.weight_shape[1] % 128 == 0
|
||||
):
|
||||
|
Reference in New Issue
Block a user