From fe5d09f9ee9b8ccb72ae343de684b75a42d2a6e0 Mon Sep 17 00:00:00 2001
From: Sunny-bot1 <68891411+Sunny-bot1@users.noreply.github.com>
Date: Sat, 30 Aug 2025 17:50:17 +0800
Subject: [PATCH] [FIX]Fix Machete compile via ENABLE_MACHETE (#3727)

* add ENABLE_MACHETE

* fix

* revert

* update

* pre_commit

* fix

* fix

---------

Co-authored-by: Ayakouji <yuhongh@qq.com>
Co-authored-by: Jiang-Jia-Jun <163579578+Jiang-Jia-Jun@users.noreply.github.com>
Co-authored-by: aquagull <hongyuh@qq.com>
---
 custom_ops/gpu_ops/cpp_extensions.cc                 |  2 ++
 custom_ops/setup_ops.py                              |  4 +++-
 .../layers/quantization/ops/machete_mm.py            |  8 +++++++-
 .../layers/quantization/weight_only.py               | 12 +++++-------
 4 files changed, 17 insertions(+), 9 deletions(-)

diff --git a/custom_ops/gpu_ops/cpp_extensions.cc b/custom_ops/gpu_ops/cpp_extensions.cc
index b0fe0400f..027a33dc0 100644
--- a/custom_ops/gpu_ops/cpp_extensions.cc
+++ b/custom_ops/gpu_ops/cpp_extensions.cc
@@ -986,6 +986,7 @@ PYBIND11_MODULE(fastdeploy_ops, m) {
         py::arg("recv_expert_count"), py::arg("block_size"),
         "per token per block quant");
 
+#ifdef ENABLE_MACHETE  // set via -DENABLE_MACHETE in setup_ops.py when the machete sources are built
   /*machete/machete_mm.cu
    * machete_mm
    */
@@ -1004,6 +1005,7 @@ PYBIND11_MODULE(fastdeploy_ops, m) {
    * machete_supported_schedules
    */
   m.def("machete_supported_schedules", &MacheteSupportedSchedules, "machete supported schedules function");
+#endif  // ENABLE_MACHETE
 
   /**
    * moe/fused_moe/moe_topk_select.cu
diff --git a/custom_ops/setup_ops.py b/custom_ops/setup_ops.py
index 3a27d64c5..a0757d180 100644
--- a/custom_ops/setup_ops.py
+++ b/custom_ops/setup_ops.py
@@ -373,6 +373,7 @@ elif paddle.is_compiled_with_cuda():
         if not os.listdir(json_dir):
             raise ValueError("Git clone nlohmann_json failed!")
 
+    cc_compile_args = []  # flags for the "cxx" entry of extra_compile_args (kept apart from the nvcc flags)
     nvcc_compile_args = get_gencode_flags(archs)
     nvcc_compile_args += ["-DPADDLE_DEV"]
     nvcc_compile_args += ["-DPADDLE_ON_INFERENCE"]
@@ -519,12 +520,13 @@ elif paddle.is_compiled_with_cuda():
         sources += find_end_files("gpu_ops/wfp8afp8_sparse_gemm", ".cu")
         os.system("python gpu_ops/machete/generate.py")
         sources += find_end_files("gpu_ops/machete", ".cu")
+        cc_compile_args += ["-DENABLE_MACHETE"]  # enables the machete bindings guarded in cpp_extensions.cc
 
     setup(
         name="fastdeploy_ops",
         ext_modules=CUDAExtension(
             sources=sources,
-            extra_compile_args={"nvcc": nvcc_compile_args},
+            extra_compile_args={"cxx": cc_compile_args, "nvcc": nvcc_compile_args},
             libraries=["cublasLt"],
             extra_link_args=["-lcuda"],
         ),
diff --git a/fastdeploy/model_executor/layers/quantization/ops/machete_mm.py b/fastdeploy/model_executor/layers/quantization/ops/machete_mm.py
index 57ed4a4bd..218da0d21 100644
--- a/fastdeploy/model_executor/layers/quantization/ops/machete_mm.py
+++ b/fastdeploy/model_executor/layers/quantization/ops/machete_mm.py
@@ -26,8 +26,14 @@ def get_sm_version():
     return cc
 
 
+_ENABLE_MACHETE = False  # becomes True only when the machete ops below import successfully
 if current_platform.is_cuda() and get_sm_version() == 90:
-    from fastdeploy.model_executor.ops.gpu import machete_mm, machete_prepack_B
+    try:
+        from fastdeploy.model_executor.ops.gpu import machete_mm, machete_prepack_B
+
+        _ENABLE_MACHETE = True
+    except Exception:
+        pass  # ops unavailable (e.g. extension built without ENABLE_MACHETE); flag stays False
 
 
 def get_pack_factor(num_bits):
diff --git a/fastdeploy/model_executor/layers/quantization/weight_only.py b/fastdeploy/model_executor/layers/quantization/weight_only.py
index 717c933f5..89c0a5d88 100644
--- a/fastdeploy/model_executor/layers/quantization/weight_only.py
+++ b/fastdeploy/model_executor/layers/quantization/weight_only.py
@@ -34,12 +34,6 @@ from ..utils import get_tensor
 from .quant_base import QuantConfigBase, QuantMethodBase
 
 
-def get_sm_version():
-    prop = paddle.device.cuda.get_device_properties()
-    cc = prop.major * 10 + prop.minor
-    return cc
-
-
 class WeightOnlyConfig(QuantConfigBase):
     """
     Quantization config for weight only
@@ -139,10 +133,14 @@ class WeightOnlyConfig(QuantConfigBase):
                 else:
                     raise ValueError(f"Unsupported MOE backend {layer.use_method}")
             else:
+                from fastdeploy.model_executor.layers.quantization.ops.machete_mm import (
+                    _ENABLE_MACHETE,
+                )
+
                 if (
                     self.name() == "wint4"
+                    and _ENABLE_MACHETE  # True only on CUDA SM 90 with importable machete ops
                     and envs.FD_USE_MACHETE == "1"
-                    and get_sm_version() == 90
                     and layer.weight_shape[1]
                     and layer.weight_shape[1] % 128 == 0
                 ):