mirror of
https://github.com/PaddlePaddle/FastDeploy.git
synced 2025-10-16 05:30:58 +08:00
【Inference Optimize】Update MergedReplicatedLinear for DSK qkv_a_proj_with_mqa. (#3673)
* support MergedReplicatedLinear
* update MergedReplicatedLinear to support DSK_wint4 V1_load
* update model name
* update linear class
* fix
* fix v0 moe_bias load

---------

Co-authored-by: bukejiyu <52310069+bukejiyu@users.noreply.github.com>
This commit is contained in:
@@ -58,6 +58,19 @@ model_param_map = {
|
||||
{"quant_type": "block_wise_fp8", "backend": "deepgemm", "env": {"DG_NVCC_OVERRIDE_CPP_STANDARD": "17"}},
|
||||
],
|
||||
},
|
||||
"DeepSeek-V3-0324": {
|
||||
"tensor_parallel_size": 2,
|
||||
"quantizations": [
|
||||
{
|
||||
"quant_type": "wint4",
|
||||
"env": {
|
||||
"FD_ATTENTION_BACKEND": "MLA_ATTN",
|
||||
"FLAGS_mla_use_tensorcore": "1",
|
||||
"FLAGS_flash_attn_version": "3",
|
||||
},
|
||||
},
|
||||
],
|
||||
},
|
||||
}
|
||||
|
||||
|
||||
|
Reference in New Issue
Block a user