【Inference Optimize】Update MergedReplicatedLinear for DSK qkv_a_proj_with_mqa. (#3673)

* support MergedReplicatedLinear
* update MergedReplicatedLinear to support DSK_wint4 V1_load
* update model name
* update linear class
* fix
* fix v0 moe_bias load

---------

Co-authored-by: bukejiyu <52310069+bukejiyu@users.noreply.github.com>
2025-10-05 08:37:06 +08:00 · 2025-09-05 12:16:05 +08:00
parent b23fc654d9
commit 41aee08982
4 changed files with 102 additions and 4 deletions
--- a/fastdeploy/model_executor/layers/quantization/weight_only.py
+++ b/fastdeploy/model_executor/layers/quantization/weight_only.py
@@ -24,6 +24,7 @@ from paddle.nn.quant import weight_only_linear, weight_quantize
 from fastdeploy import envs
 from fastdeploy.model_executor.layers.linear import (
    MergedColumnParallelLinear,
+    MergedReplicatedLinear,
    QKVParallelLinear,
 )
 from fastdeploy.model_executor.utils import TensorTracker, free_tensor, set_weight_attrs
@@ -203,11 +204,15 @@ class WeightOnlyLinearMethod(QuantMethodBase):
                default_initializer=paddle.nn.initializer.Constant(0),
            )
            quant_attrs = extra_weight_attrs
-            if isinstance(layer, MergedColumnParallelLinear) or isinstance(layer, QKVParallelLinear):
+            if (
+                isinstance(layer, MergedColumnParallelLinear)
+                or isinstance(layer, QKVParallelLinear)
+                or isinstance(layer, MergedReplicatedLinear)
+            ):
                quant_attrs = {
                    **extra_weight_attrs,
                    "tensor_track": TensorTracker(
-                        shape=layer.weight_shape, output_dim=extra_weight_attrs.get("output_dim")
+                        shape=layer.weight_shape, output_dim=extra_weight_attrs.get("output_dim", True)
                    ),
                }
            set_weight_attrs(