Mirror of https://github.com/PaddlePaddle/FastDeploy.git, synced 2025-09-27 12:52:29 +08:00
[FIX]Fix Machete compile via ENABLE_MACHETE (#3727)
* add ENABLE_MACHETE
* fix
* revert
* update
* pre_commit
* fix
* fix

---------

Co-authored-by: Ayakouji <yuhongh@qq.com>
Co-authored-by: Jiang-Jia-Jun <163579578+Jiang-Jia-Jun@users.noreply.github.com>
Co-authored-by: aquagull <hongyuh@qq.com>
@@ -986,6 +986,7 @@ PYBIND11_MODULE(fastdeploy_ops, m) {
         py::arg("recv_expert_count"), py::arg("block_size"),
         "per token per block quant");
 
+#ifdef ENABLE_MACHETE
   /*machete/machete_mm.cu
    * machete_mm
    */
@@ -1004,6 +1005,7 @@ PYBIND11_MODULE(fastdeploy_ops, m) {
    * machete_supported_schedules
    */
   m.def("machete_supported_schedules", &MacheteSupportedSchedules, "machete supported schedules function");
+#endif
 
   /**
    * moe/fused_moe/moe_topk_select.cu
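Note: with the bindings fenced by ENABLE_MACHETE, a wheel built without the define never registers the machete symbols in fastdeploy_ops, so Python code has to probe for them rather than assume they exist. A minimal sketch of that probe, reusing the same import path the Python-side changes below rely on (the helper name is illustrative, not part of the patch):

def machete_ops_available() -> bool:
    # True only if the machete bindings were compiled into fastdeploy_ops.
    try:
        from fastdeploy.model_executor.ops.gpu import machete_mm, machete_prepack_B  # noqa: F401
    except Exception:  # raised when the extension was built without -DENABLE_MACHETE
        return False
    return True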
@@ -373,6 +373,7 @@ elif paddle.is_compiled_with_cuda():
         if not os.listdir(json_dir):
             raise ValueError("Git clone nlohmann_json failed!")
 
+    cc_compile_args = []
     nvcc_compile_args = get_gencode_flags(archs)
     nvcc_compile_args += ["-DPADDLE_DEV"]
     nvcc_compile_args += ["-DPADDLE_ON_INFERENCE"]
@@ -519,12 +520,13 @@ elif paddle.is_compiled_with_cuda():
         sources += find_end_files("gpu_ops/wfp8afp8_sparse_gemm", ".cu")
         os.system("python gpu_ops/machete/generate.py")
         sources += find_end_files("gpu_ops/machete", ".cu")
+        cc_compile_args += ["-DENABLE_MACHETE"]
 
     setup(
         name="fastdeploy_ops",
         ext_modules=CUDAExtension(
             sources=sources,
-            extra_compile_args={"nvcc": nvcc_compile_args},
+            extra_compile_args={"cxx": cc_compile_args, "nvcc": nvcc_compile_args},
             libraries=["cublasLt"],
             extra_link_args=["-lcuda"],
         ),
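Note: a condensed, hypothetical sketch of the build-side wiring (the paths, the glob, and the build_machete switch are placeholders, not the real setup_ops.py logic): the new cc_compile_args list carries -DENABLE_MACHETE to the host C++ compiler, so the pybind block guarded above is only compiled in when the machete .cu sources are actually part of the build.

import glob

from paddle.utils.cpp_extension import CUDAExtension, setup

sources = ["gpu_ops/cpp_extensions.cc"]  # placeholder pybind entry point
cc_compile_args = []                     # new in this patch: flags for the C++ (cxx) compiler
nvcc_compile_args = ["-DPADDLE_DEV", "-DPADDLE_ON_INFERENCE"]

build_machete = True  # in the real script this depends on the targeted GPU archs
if build_machete:
    sources += glob.glob("gpu_ops/machete/*.cu")  # generated by gpu_ops/machete/generate.py
    cc_compile_args += ["-DENABLE_MACHETE"]       # keeps bindings and kernel sources in sync

setup(
    name="fastdeploy_ops",
    ext_modules=CUDAExtension(
        sources=sources,
        extra_compile_args={"cxx": cc_compile_args, "nvcc": nvcc_compile_args},
        libraries=["cublasLt"],
        extra_link_args=["-lcuda"],
    ),
)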
@@ -26,8 +26,14 @@ def get_sm_version():
     return cc
 
 
+_ENABLE_MACHETE = False
 if current_platform.is_cuda() and get_sm_version() == 90:
-    from fastdeploy.model_executor.ops.gpu import machete_mm, machete_prepack_B
+    try:
+        from fastdeploy.model_executor.ops.gpu import machete_mm, machete_prepack_B
+
+        _ENABLE_MACHETE = True
+    except Exception:
+        pass
 
 
 def get_pack_factor(num_bits):
@@ -34,12 +34,6 @@ from ..utils import get_tensor
 from .quant_base import QuantConfigBase, QuantMethodBase
 
 
-def get_sm_version():
-    prop = paddle.device.cuda.get_device_properties()
-    cc = prop.major * 10 + prop.minor
-    return cc
-
-
 class WeightOnlyConfig(QuantConfigBase):
     """
     Quantization config for weight only
@@ -139,10 +133,14 @@ class WeightOnlyConfig(QuantConfigBase):
                 else:
                     raise ValueError(f"Unsupported MOE backend {layer.use_method}")
             else:
+                from fastdeploy.model_executor.layers.quantization.ops.machete_mm import (
+                    _ENABLE_MACHETE,
+                )
+
                 if (
                     self.name() == "wint4"
+                    and _ENABLE_MACHETE
                     and envs.FD_USE_MACHETE == "1"
-                    and get_sm_version() == 90
                     and layer.weight_shape[1]
                     and layer.weight_shape[1] % 128 == 0
                 ):
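Note: taken together, the patch moves the SM-90 check into machete_mm.py (it now gates _ENABLE_MACHETE once at import time) and makes the wint4 Machete path opt-in along three axes. A hypothetical helper, not part of the patch, that composes the same gates the new condition checks:

from fastdeploy import envs  # assumed location of the FD_USE_MACHETE switch
from fastdeploy.model_executor.layers.quantization.ops.machete_mm import _ENABLE_MACHETE


def should_use_machete(quant_name: str, weight_shape) -> bool:
    # Mirrors the new wint4 condition: machete ops compiled in, env switch on,
    # and the reduction dimension a non-zero multiple of 128.
    return (
        quant_name == "wint4"
        and _ENABLE_MACHETE
        and envs.FD_USE_MACHETE == "1"
        and bool(weight_shape[1])
        and weight_shape[1] % 128 == 0
    )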