【Inference Optimize】Update MergedReplicatedLinear for DeepSeek (DSK) qkv_a_proj_with_mqa. (#3673)

* support MergedReplicatedLinear

* update MergedReplicatedLinear to support DeepSeek (DSK) wint4 V1 load

* update model name

* update linear class

* fix

* fix v0 moe_bias loading

---------

Co-authored-by: bukejiyu <52310069+bukejiyu@users.noreply.github.com>
This commit is contained in:
AIbin
2025-09-05 12:16:05 +08:00
committed by GitHub
parent b23fc654d9
commit 41aee08982
4 changed files with 102 additions and 4 deletions

View File

@@ -58,6 +58,19 @@ model_param_map = {
{"quant_type": "block_wise_fp8", "backend": "deepgemm", "env": {"DG_NVCC_OVERRIDE_CPP_STANDARD": "17"}},
],
},
"DeepSeek-V3-0324": {
"tensor_parallel_size": 2,
"quantizations": [
{
"quant_type": "wint4",
"env": {
"FD_ATTENTION_BACKEND": "MLA_ATTN",
"FLAGS_mla_use_tensorcore": "1",
"FLAGS_flash_attn_version": "3",
},
},
],
},
}