【Inference Optimize】Update MergedReplicatedLinear for DSK qkv_a_proj_with_mqa. (#3673)

* support MergedReplicatedLinear

* update MergedReplicatedLinear to support DSK_wint4 V1_load

* update model name

* update linear class

* fix

* fix v0 moe_bias load

---------

Co-authored-by: bukejiyu <52310069+bukejiyu@users.noreply.github.com>
This commit is contained in:
AIbin
2025-09-05 12:16:05 +08:00
committed by GitHub
parent b23fc654d9
commit 41aee08982
4 changed files with 102 additions and 4 deletions

View File

@@ -24,6 +24,7 @@ from paddle.nn.quant import weight_only_linear, weight_quantize
from fastdeploy import envs
from fastdeploy.model_executor.layers.linear import (
MergedColumnParallelLinear,
+MergedReplicatedLinear,
QKVParallelLinear,
)
from fastdeploy.model_executor.utils import TensorTracker, free_tensor, set_weight_attrs
@@ -203,11 +204,15 @@ class WeightOnlyLinearMethod(QuantMethodBase):
default_initializer=paddle.nn.initializer.Constant(0),
)
quant_attrs = extra_weight_attrs
-if isinstance(layer, MergedColumnParallelLinear) or isinstance(layer, QKVParallelLinear):
+if (
+isinstance(layer, MergedColumnParallelLinear)
+or isinstance(layer, QKVParallelLinear)
+or isinstance(layer, MergedReplicatedLinear)
+):
quant_attrs = {
**extra_weight_attrs,
"tensor_track": TensorTracker(
-shape=layer.weight_shape, output_dim=extra_weight_attrs.get("output_dim")
+shape=layer.weight_shape, output_dim=extra_weight_attrs.get("output_dim", True)
),
}
set_weight_attrs(