Use Machete by default for WINT4/WINT8 dense GEMM (#4451)

2025-12-24 13:28:13 +08:00 · 2025-10-23 17:57:59 +08:00
parent a240425db9
commit 4ffe41a747
12 changed files with 310 additions and 15 deletions
--- a/docs/usage/environment_variables.md
+++ b/docs/usage/environment_variables.md
@@ -78,7 +78,7 @@ environment_variables: dict[str, Callable[[], Any]] = {
    "FD_ENABLE_MODEL_LOAD_CACHE": lambda: bool(int(os.getenv("FD_ENABLE_MODEL_LOAD_CACHE", "0"))),

    # Whether to use Machete for wint4 dense GEMM.
-    "FD_USE_MACHETE": lambda: os.getenv("FD_USE_MACHETE", "0"),
+    "FD_USE_MACHETE": lambda: os.getenv("FD_USE_MACHETE", "1"),

    # Used to truncate the string inserted during thinking when reasoning in a model. (</think> for ernie4_5_vl, \n</think>\n\n for ernie_x1)
    "FD_LIMIT_THINKING_CONTENT_TRUNCATE_STR": lambda: os.getenv("FD_LIMIT_THINKING_CONTENT_TRUNCATE_STR", "</think>"),