@@ -97,7 +97,6 @@ class LinearBase(nn.Layer):
         input_size: int = None,
         output_size: int = None,
         with_bias: bool = False,
-        add_bias: bool = False,
         skip_quant: bool = False,
         weight_dtype: str = "",
         weight_key: str = "",
@@ -112,7 +111,6 @@ class LinearBase(nn.Layer):
             input_size (int): Number of input features. Defaults to None.
             output_size (int): Number of output features. Defaults to None.
             with_bias (bool): Whether to include bias or not. Defaults to False.
-            add_bias (bool): Whether to add bias in the current layer or in the pre/post layer. Defaults to False.
             skip_quant (bool): Whether to skip quantization. Defaults to False.
 
         Raises:
@@ -137,7 +135,6 @@ class LinearBase(nn.Layer):
         self.input_size = input_size
         self.output_size = output_size
         self.with_bias = with_bias
-        self.add_bias = add_bias
         self.prefix = prefix
         self.is_quantized = fd_config.model_config.is_quantized and not (
             fd_config.quant_config.name() == "mix_quant" and fd_config.quant_config.dense_quant_type is None
@@ -270,7 +267,6 @@ class ReplicatedLinear(LinearBase):
         input_size: int = None,
         output_size: int = None,
         with_bias: bool = False,
-        add_bias: bool = False,
         skip_quant: bool = False,
         weight_dtype: str = "",
         weight_key: str = "",
@@ -286,7 +282,6 @@ class ReplicatedLinear(LinearBase):
             input_size (int): Number of input features. Defaults to None.
             output_size (int): Number of output features. Defaults to None.
             with_bias (bool): Whether to include bias or not. Defaults to False.
-            add_bias (bool): Whether to add bias in the current layer or in the pre/post layer. Defaults to False.
             skip_quant (bool): Whether to skip quantization. Defaults to False.
         """
         super().__init__(
@@ -295,7 +290,6 @@ class ReplicatedLinear(LinearBase):
             input_size=input_size,
             output_size=output_size,
             with_bias=with_bias,
-            add_bias=add_bias,
             skip_quant=skip_quant,
             weight_dtype=weight_dtype,
             weight_key=weight_key,
@@ -325,7 +319,6 @@ class MergedReplicatedLinear(ReplicatedLinear):
         input_size: int = None,
         output_sizes: list[int] = None,
         with_bias: bool = False,
-        add_bias: bool = False,
         skip_quant: bool = False,
         weight_dtype: str = "",
         weight_key: str = "",
@@ -339,7 +332,6 @@ class MergedReplicatedLinear(ReplicatedLinear):
             input_size (int): Number of input features. Defaults to None.
             output_sizes (list[int]): Number of output features list. Defaults to None.
             with_bias (bool): Whether to include bias or not. Defaults to False.
-            add_bias (bool): Whether to add bias in the current layer or in the pre/post layer. Defaults to False.
             skip_quant (bool): Whether to skip quantization. Defaults to False.
         """
         super().__init__(
@@ -348,7 +340,6 @@ class MergedReplicatedLinear(ReplicatedLinear):
             input_size=input_size,
             output_size=sum(output_sizes),
             with_bias=with_bias,
-            add_bias=add_bias,
             skip_quant=skip_quant,
             weight_dtype=weight_dtype,
             weight_key=weight_key,
@@ -403,7 +394,6 @@ class ColumnParallelLinear(LinearBase):
         input_size: int = None,
         output_size: int = None,
         with_bias: bool = False,
-        add_bias: bool = False,
         skip_quant: bool = False,
         weight_dtype: str = "",
     ):
@@ -417,7 +407,6 @@ class ColumnParallelLinear(LinearBase):
             input_size (int): Number of input features. Defaults to None.
             output_size (int): Number of output features. Defaults to None.
             with_bias (bool): Whether to include bias or not. Defaults to False.
-            add_bias (bool): Whether to add bias in the current layer or in the pre/post layer. Defaults to False.
             skip_quant (bool): Whether to skip quantization. Defaults to False.
         """
         self.fd_config = fd_config
@@ -432,7 +421,6 @@ class ColumnParallelLinear(LinearBase):
             input_size=self.input_size,
             output_size=self.output_size,
             with_bias=with_bias,
-            add_bias=add_bias,
             skip_quant=skip_quant,
             weight_dtype=weight_dtype,
         )
@@ -480,7 +468,6 @@ class MergedColumnParallelLinear(ColumnParallelLinear):
         input_size: int = None,
         output_size: int = None,
         with_bias: bool = False,
-        add_bias: bool = False,
         activation: str = "gelu",
         skip_quant: bool = False,
     ):
@@ -494,7 +481,6 @@ class MergedColumnParallelLinear(ColumnParallelLinear):
             input_size (int): Number of input features. Defaults to None.
             output_size (int): Number of output features. Defaults to None.
             with_bias (bool): Whether to include bias or not. Defaults to False.
-            add_bias (bool): Whether to add bias in the current layer or in the pre/post layer. Defaults to False.
             activation (str): Activation function to use. Defaults to "gelu".
             skip_quant (bool): Whether to skip quantization. Defaults to False.
         """
@@ -510,7 +496,6 @@ class MergedColumnParallelLinear(ColumnParallelLinear):
             input_size=input_size,
             output_size=output_size,
             with_bias=with_bias,
-            add_bias=add_bias,
             skip_quant=skip_quant,
         )
 
@@ -617,7 +602,6 @@ class QKVParallelLinear(ColumnParallelLinear):
         fd_config,
         prefix,
         with_bias=False,
-        add_bias=True,
         num_heads: Optional[int] = None,
         kv_num_heads: Optional[int] = None,
         hidden_size: Optional[int] = None,
@@ -633,7 +617,6 @@ class QKVParallelLinear(ColumnParallelLinear):
             prefix (str): Unique name of the layer, used to name internal attributes.
                 Can be arbitrarily named.
             with_bias (bool): Whether to include bias or not. Defaults to False.
-            add_bias (bool): Whether to add bias in the current layer or in the pre/post layer. Defaults to True.
             num_heads (Optional[int]): Number of attention heads in the model.
             kv_num_heads (Optional[int]): Number of key/value heads, used for multi-query or grouped-query attention.
             hidden_size (Optional[int]): Total hidden layer dimension, typically the embedding size.
@@ -661,7 +644,6 @@ class QKVParallelLinear(ColumnParallelLinear):
             input_size=input_size,
             output_size=output_size,
             with_bias=with_bias,
-            add_bias=add_bias,
             skip_quant=skip_quant,
             weight_dtype=weight_dtype,
         )
@@ -831,7 +813,6 @@ class RowParallelLinear(LinearBase):
         input_size: int = None,
         output_size: int = None,
         with_bias: bool = False,
-        add_bias: bool = False,
         reduce_results: bool = True,
         skip_quant: bool = False,
         weight_dtype: str = "",
@@ -847,7 +828,6 @@ class RowParallelLinear(LinearBase):
             input_size (int): Number of input features. Defaults to None.
             output_size (int): Number of output features. Defaults to None.
             with_bias (bool): Whether to include bias or not. Defaults to False.
-            add_bias (bool): Whether to add bias in the current layer or in the pre/post layer. Defaults to False.
             skip_quant (bool): Whether to skip quantization. Defaults to False.
         """
         self.fd_config = fd_config
@@ -875,7 +855,6 @@ class RowParallelLinear(LinearBase):
             input_size=self.input_size,
             output_size=self.output_size,
             with_bias=with_bias,
-            add_bias=add_bias,
             skip_quant=skip_quant,
             weight_dtype=weight_dtype,
         )
@@ -896,10 +875,8 @@ class RowParallelLinear(LinearBase):
 
         self.reduce_results = reduce_results and not self.split_token
 
-        if add_bias:
-            assert with_bias, "with_bias must be True when add_bias is True."
-            if self.tp_size > 1 and self.reduce_results:
-                set_weight_attrs(self.bias, {"tp_row_bias": True})
+        if self.with_bias and self.tp_size > 1 and self.reduce_results:
+            set_weight_attrs(self.bias, {"tp_row_bias": True})
 
     def all2all_transpose(self, x: paddle.Tensor) -> paddle.Tensor:
         token_num = x.shape[0]
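
Note on the last hunk: with the `add_bias` parameter gone, the `tp_row_bias` tagging is keyed off `with_bias` alone instead of being gated behind `if add_bias:`. A minimal sketch of the resulting condition, with `should_tag_row_bias` being a hypothetical helper written for illustration (it is not part of FastDeploy):

    def should_tag_row_bias(with_bias: bool, tp_size: int, reduce_results: bool) -> bool:
        # Mirrors the new condition in RowParallelLinear.__init__: the bias is
        # tagged with {"tp_row_bias": True} only when the layer actually owns a
        # bias and its output is all-reduced across a tensor-parallel group,
        # so downstream loading/compute can account for the reduction.
        return with_bias and tp_size > 1 and reduce_results

    # Previously the caller had to pass add_bias=True (which also asserted
    # with_bias=True); the same tagging now follows from with_bias directly:
    assert should_tag_row_bias(with_bias=True, tp_size=8, reduce_results=True)
    assert not should_tag_row_bias(with_bias=False, tp_size=8, reduce_results=True)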