From fe5d09f9ee9b8ccb72ae343de684b75a42d2a6e0 Mon Sep 17 00:00:00 2001 From: Sunny-bot1 <68891411+Sunny-bot1@users.noreply.github.com> Date: Sat, 30 Aug 2025 17:50:17 +0800 Subject: [PATCH] [FIX]Fix Machete compile via ENABLE_MACHETE (#3727) * add ENABLE_MACHETE * fix * revert * update * pre_commit * fix * fix --------- Co-authored-by: Ayakouji Co-authored-by: Jiang-Jia-Jun <163579578+Jiang-Jia-Jun@users.noreply.github.com> Co-authored-by: aquagull --- custom_ops/gpu_ops/cpp_extensions.cc | 2 ++ custom_ops/setup_ops.py | 4 +++- .../layers/quantization/ops/machete_mm.py | 8 +++++++- .../layers/quantization/weight_only.py | 12 +++++------- 4 files changed, 17 insertions(+), 9 deletions(-) diff --git a/custom_ops/gpu_ops/cpp_extensions.cc b/custom_ops/gpu_ops/cpp_extensions.cc index b0fe0400f..027a33dc0 100644 --- a/custom_ops/gpu_ops/cpp_extensions.cc +++ b/custom_ops/gpu_ops/cpp_extensions.cc @@ -986,6 +986,7 @@ PYBIND11_MODULE(fastdeploy_ops, m) { py::arg("recv_expert_count"), py::arg("block_size"), "per token per block quant"); +#ifdef ENABLE_MACHETE /*machete/machete_mm.cu * machete_mm */ @@ -1004,6 +1005,7 @@ PYBIND11_MODULE(fastdeploy_ops, m) { * machete_supported_schedules */ m.def("machete_supported_schedules", &MacheteSupportedSchedules, "machete supported schedules function"); +#endif /** * moe/fused_moe/moe_topk_select.cu diff --git a/custom_ops/setup_ops.py b/custom_ops/setup_ops.py index 3a27d64c5..a0757d180 100644 --- a/custom_ops/setup_ops.py +++ b/custom_ops/setup_ops.py @@ -373,6 +373,7 @@ elif paddle.is_compiled_with_cuda(): if not os.listdir(json_dir): raise ValueError("Git clone nlohmann_json failed!") + cc_compile_args = [] nvcc_compile_args = get_gencode_flags(archs) nvcc_compile_args += ["-DPADDLE_DEV"] nvcc_compile_args += ["-DPADDLE_ON_INFERENCE"] @@ -519,12 +520,13 @@ elif paddle.is_compiled_with_cuda(): sources += find_end_files("gpu_ops/wfp8afp8_sparse_gemm", ".cu") os.system("python gpu_ops/machete/generate.py") sources += find_end_files("gpu_ops/machete", ".cu") + cc_compile_args += ["-DENABLE_MACHETE"] setup( name="fastdeploy_ops", ext_modules=CUDAExtension( sources=sources, - extra_compile_args={"nvcc": nvcc_compile_args}, + extra_compile_args={"cxx": cc_compile_args, "nvcc": nvcc_compile_args}, libraries=["cublasLt"], extra_link_args=["-lcuda"], ), diff --git a/fastdeploy/model_executor/layers/quantization/ops/machete_mm.py b/fastdeploy/model_executor/layers/quantization/ops/machete_mm.py index 57ed4a4bd..218da0d21 100644 --- a/fastdeploy/model_executor/layers/quantization/ops/machete_mm.py +++ b/fastdeploy/model_executor/layers/quantization/ops/machete_mm.py @@ -26,8 +26,14 @@ def get_sm_version(): return cc +_ENABLE_MACHETE = False if current_platform.is_cuda() and get_sm_version() == 90: - from fastdeploy.model_executor.ops.gpu import machete_mm, machete_prepack_B + try: + from fastdeploy.model_executor.ops.gpu import machete_mm, machete_prepack_B + + _ENABLE_MACHETE = True + except Exception: + pass def get_pack_factor(num_bits): diff --git a/fastdeploy/model_executor/layers/quantization/weight_only.py b/fastdeploy/model_executor/layers/quantization/weight_only.py index 717c933f5..89c0a5d88 100644 --- a/fastdeploy/model_executor/layers/quantization/weight_only.py +++ b/fastdeploy/model_executor/layers/quantization/weight_only.py @@ -34,12 +34,6 @@ from ..utils import get_tensor from .quant_base import QuantConfigBase, QuantMethodBase -def get_sm_version(): - prop = paddle.device.cuda.get_device_properties() - cc = prop.major * 10 + prop.minor - return cc - - class WeightOnlyConfig(QuantConfigBase): """ Quantization config for weight only @@ -139,10 +133,14 @@ class WeightOnlyConfig(QuantConfigBase): else: raise ValueError(f"Unsupported MOE backend {layer.use_method}") else: + from fastdeploy.model_executor.layers.quantization.ops.machete_mm import ( + _ENABLE_MACHETE, + ) + if ( self.name() == "wint4" + and _ENABLE_MACHETE and envs.FD_USE_MACHETE == "1" - and get_sm_version() == 90 and layer.weight_shape[1] and layer.weight_shape[1] % 128 == 0 ):