[FIX]Fix Machete compile via ENABLE_MACHETE (#3727)

* add ENABLE_MACHETE

* fix

* revert

* update

* pre_commit

* fix

* fix

---------

Co-authored-by: Ayakouji <yuhongh@qq.com>
Co-authored-by: Jiang-Jia-Jun <163579578+Jiang-Jia-Jun@users.noreply.github.com>
Co-authored-by: aquagull <hongyuh@qq.com>
This commit is contained in:
Sunny-bot1
2025-08-30 17:50:17 +08:00
committed by GitHub
parent b9af95cf1c
commit fe5d09f9ee
4 changed files with 17 additions and 9 deletions

View File

@@ -986,6 +986,7 @@ PYBIND11_MODULE(fastdeploy_ops, m) {
           py::arg("recv_expert_count"), py::arg("block_size"),
           "per token per block quant");
+#ifdef ENABLE_MACHETE
   /*machete/machete_mm.cu
    * machete_mm
    */
@@ -1004,6 +1005,7 @@ PYBIND11_MODULE(fastdeploy_ops, m) {
    * machete_supported_schedules
    */
   m.def("machete_supported_schedules", &MacheteSupportedSchedules, "machete supported schedules function");
+#endif
   /**
    * moe/fused_moe/moe_topk_select.cu

View File

@@ -373,6 +373,7 @@ elif paddle.is_compiled_with_cuda():
         if not os.listdir(json_dir):
             raise ValueError("Git clone nlohmann_json failed!")
+    cc_compile_args = []
     nvcc_compile_args = get_gencode_flags(archs)
     nvcc_compile_args += ["-DPADDLE_DEV"]
     nvcc_compile_args += ["-DPADDLE_ON_INFERENCE"]
@@ -519,12 +520,13 @@ elif paddle.is_compiled_with_cuda():
         sources += find_end_files("gpu_ops/wfp8afp8_sparse_gemm", ".cu")
         os.system("python gpu_ops/machete/generate.py")
         sources += find_end_files("gpu_ops/machete", ".cu")
+        cc_compile_args += ["-DENABLE_MACHETE"]
     setup(
         name="fastdeploy_ops",
         ext_modules=CUDAExtension(
             sources=sources,
-            extra_compile_args={"nvcc": nvcc_compile_args},
+            extra_compile_args={"cxx": cc_compile_args, "nvcc": nvcc_compile_args},
             libraries=["cublasLt"],
             extra_link_args=["-lcuda"],
         ),

View File

@@ -26,8 +26,14 @@ def get_sm_version():
     return cc

+_ENABLE_MACHETE = False
 if current_platform.is_cuda() and get_sm_version() == 90:
-    from fastdeploy.model_executor.ops.gpu import machete_mm, machete_prepack_B
+    try:
+        from fastdeploy.model_executor.ops.gpu import machete_mm, machete_prepack_B
+
+        _ENABLE_MACHETE = True
+    except Exception:
+        pass

 def get_pack_factor(num_bits):

View File

@@ -34,12 +34,6 @@ from ..utils import get_tensor
 from .quant_base import QuantConfigBase, QuantMethodBase

-def get_sm_version():
-    prop = paddle.device.cuda.get_device_properties()
-    cc = prop.major * 10 + prop.minor
-    return cc

 class WeightOnlyConfig(QuantConfigBase):
     """
     Quantization config for weight only

@@ -139,10 +133,14 @@ class WeightOnlyConfig(QuantConfigBase):
             else:
                 raise ValueError(f"Unsupported MOE backend {layer.use_method}")
         else:
+            from fastdeploy.model_executor.layers.quantization.ops.machete_mm import (
+                _ENABLE_MACHETE,
+            )
             if (
                 self.name() == "wint4"
+                and _ENABLE_MACHETE
                 and envs.FD_USE_MACHETE == "1"
-                and get_sm_version() == 90
                 and layer.weight_shape[1]
                 and layer.weight_shape[1] % 128 == 0
             ):