【Inference Optimize】Update MergedReplicatedLinear for DSK qkv_a_proj_with_mqa. (#3673)

* support MergedReplicatedLinear
* update MergedReplicatedLinear to support DSK_wint4 V1_load
* update model name
* update linear class
* fix
* fix v0 moe_bias load

---------

Co-authored-by: bukejiyu <52310069+bukejiyu@users.noreply.github.com>
2025-10-16 05:30:58 +08:00 · 2025-09-05 12:16:05 +08:00
parent b23fc654d9
commit 41aee08982
4 changed files with 102 additions and 4 deletions
--- a/tests/model_loader/test_common_model.py
+++ b/tests/model_loader/test_common_model.py
@@ -58,6 +58,19 @@ model_param_map = {
            {"quant_type": "block_wise_fp8", "backend": "deepgemm", "env": {"DG_NVCC_OVERRIDE_CPP_STANDARD": "17"}},
        ],
    },
+    "DeepSeek-V3-0324": {
+        "tensor_parallel_size": 2,
+        "quantizations": [
+            {
+                "quant_type": "wint4",
+                "env": {
+                    "FD_ATTENTION_BACKEND": "MLA_ATTN",
+                    "FLAGS_mla_use_tensorcore": "1",
+                    "FLAGS_flash_attn_version": "3",
+                },
+            },
+        ],
+    },
 }