[FIX] Fix Machete compile via ENABLE_MACHETE (#3727)

* add ENABLE_MACHETE
* fix
* revert
* update
* pre_commit
* fix
* fix

---------

Co-authored-by: Ayakouji <yuhongh@qq.com>
Co-authored-by: Jiang-Jia-Jun <163579578+Jiang-Jia-Jun@users.noreply.github.com>
Co-authored-by: aquagull <hongyuh@qq.com>
2025-09-27 04:46:16 +08:00 · 2025-08-30 17:50:17 +08:00
parent b9af95cf1c
commit fe5d09f9ee
4 changed files with 17 additions and 9 deletions
--- a/custom_ops/gpu_ops/cpp_extensions.cc
+++ b/custom_ops/gpu_ops/cpp_extensions.cc
@@ -986,6 +986,7 @@ PYBIND11_MODULE(fastdeploy_ops, m) {
        py::arg("recv_expert_count"), py::arg("block_size"),
        "per token per block quant");

+#ifdef ENABLE_MACHETE
  /*machete/machete_mm.cu
   * machete_mm
   */
@@ -1004,6 +1005,7 @@ PYBIND11_MODULE(fastdeploy_ops, m) {
   * machete_supported_schedules
   */
  m.def("machete_supported_schedules", &MacheteSupportedSchedules, "machete supported schedules function");
+#endif

  /**
   * moe/fused_moe/moe_topk_select.cu
--- a/custom_ops/setup_ops.py
+++ b/custom_ops/setup_ops.py
@@ -373,6 +373,7 @@ elif paddle.is_compiled_with_cuda():
        if not os.listdir(json_dir):
            raise ValueError("Git clone nlohmann_json failed!")

+    cc_compile_args = []
    nvcc_compile_args = get_gencode_flags(archs)
    nvcc_compile_args += ["-DPADDLE_DEV"]
    nvcc_compile_args += ["-DPADDLE_ON_INFERENCE"]
@@ -519,12 +520,13 @@ elif paddle.is_compiled_with_cuda():
        sources += find_end_files("gpu_ops/wfp8afp8_sparse_gemm", ".cu")
        os.system("python gpu_ops/machete/generate.py")
        sources += find_end_files("gpu_ops/machete", ".cu")
+        cc_compile_args += ["-DENABLE_MACHETE"]

    setup(
        name="fastdeploy_ops",
        ext_modules=CUDAExtension(
            sources=sources,
-            extra_compile_args={"nvcc": nvcc_compile_args},
+            extra_compile_args={"cxx": cc_compile_args, "nvcc": nvcc_compile_args},
            libraries=["cublasLt"],
            extra_link_args=["-lcuda"],
        ),
--- a/fastdeploy/model_executor/layers/quantization/ops/machete_mm.py
+++ b/fastdeploy/model_executor/layers/quantization/ops/machete_mm.py
@@ -26,8 +26,14 @@ def get_sm_version():
    return cc


+_ENABLE_MACHETE = False
 if current_platform.is_cuda() and get_sm_version() == 90:
-    from fastdeploy.model_executor.ops.gpu import machete_mm, machete_prepack_B
+    try:
+        from fastdeploy.model_executor.ops.gpu import machete_mm, machete_prepack_B
+
+        _ENABLE_MACHETE = True
+    except Exception:
+        pass


 def get_pack_factor(num_bits):
--- a/fastdeploy/model_executor/layers/quantization/weight_only.py
+++ b/fastdeploy/model_executor/layers/quantization/weight_only.py
@@ -34,12 +34,6 @@ from ..utils import get_tensor
 from .quant_base import QuantConfigBase, QuantMethodBase


-def get_sm_version():
-    prop = paddle.device.cuda.get_device_properties()
-    cc = prop.major * 10 + prop.minor
-    return cc
-
-
 class WeightOnlyConfig(QuantConfigBase):
    """
    Quantization config for weight only
@@ -139,10 +133,14 @@ class WeightOnlyConfig(QuantConfigBase):
                else:
                    raise ValueError(f"Unsupported MOE backend {layer.use_method}")
            else:
+                from fastdeploy.model_executor.layers.quantization.ops.machete_mm import (
+                    _ENABLE_MACHETE,
+                )
+
                if (
                    self.name() == "wint4"
+                    and _ENABLE_MACHETE
                    and envs.FD_USE_MACHETE == "1"
-                    and get_sm_version() == 90
                    and layer.weight_shape[1]
                    and layer.weight_shape[1] % 128 == 0
                ):