Mirror of https://github.com/PaddlePaddle/FastDeploy.git, synced 2025-12-24 13:28:13 +08:00.
This reverts commit 93fcf7e4ec.
@@ -30,10 +30,7 @@ if current_platform.is_cuda():
     from fastdeploy.model_executor.ops.gpu import moe_expert_dispatch, moe_expert_reduce
 
     try:
-        from fastdeploy.model_executor.ops.gpu import (
-            w4afp8_gemm_scale_permute,
-            w4afp8_gemm_weight_convert,
-        )
+        from fastdeploy.model_executor.ops.gpu import w4afp8_gemm_scale_permute
     except:
         logger.warning("import w4afp8_gemm_scale_permute Failed!")
 elif current_platform.is_iluvatar():
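Note: the revert narrows the guarded import to the single op that is still used; w4afp8_gemm_weight_convert disappears here because the weight-loading hunk further down switches to Paddle's weight_quantize. For readers unfamiliar with the pattern, a minimal sketch of guarding an optional compiled op (module and op names are hypothetical placeholders):

    import logging

    logger = logging.getLogger(__name__)

    try:
        # Compiled extension op; may be absent on some platforms or builds.
        from my_pkg.ops.gpu import fused_op
    except ImportError:  # a narrow except is preferable to the bare one above
        fused_op = None
        logger.warning("fused_op unavailable; falling back to the reference path")

    def run(x):
        # Use the fused kernel when present, otherwise the reference fallback.
        return fused_op(x) if fused_op is not None else x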
@@ -78,7 +75,6 @@ class CutlassMoEMethod(UnquantizedFusedMoEMethod):
         expert_idx_per_token: paddle.Tensor,
         used_in_ep_low_latency: bool = False,
         estimate_total_token_nums: int = -1,
-        dequant_scale: paddle.Tensor = None,
     ):
         """
         Paddle Cutlass compute Fused MoE.
@@ -104,7 +100,7 @@ class CutlassMoEMethod(UnquantizedFusedMoEMethod):
             token_nums_per_expert,
             getattr(layer, self.added_weight_attrs[0]),
             getattr(layer, self.added_weight_attrs[1]),
-            dequant_scale,
+            # None,
             (layer.up_gate_proj_bias if hasattr(layer, "up_gate_proj_bias") else None),
             (layer.up_gate_proj_weight_scale if hasattr(layer, "up_gate_proj_weight_scale") else None),
             (layer.down_proj_weight_scale if hasattr(layer, "down_proj_weight_scale") else None),
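Note: the hasattr guards in this call are equivalent to getattr with a default, which is the more compact spelling; a quick illustration (Layer is a stand-in class, not the repo's):

    class Layer:  # stand-in object for illustration
        up_gate_proj_bias = "bias-tensor"

    layer = Layer()

    # The pattern used in the diff:
    a = layer.up_gate_proj_bias if hasattr(layer, "up_gate_proj_bias") else None
    # An equivalent, shorter spelling:
    b = getattr(layer, "up_gate_proj_bias", None)
    assert a == b
    assert getattr(layer, "down_proj_bias", None) is None  # missing attribute -> None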
@@ -116,7 +112,6 @@ class CutlassMoEMethod(UnquantizedFusedMoEMethod):
             getattr(layer.moe_quant_config, "hadamard_block_size", 128),
             layer.activation,
         )
 
         if layer.with_bias:
             down_proj_bias_expand = paddle.index_select(layer.down_proj_bias, expert_idx_per_token, axis=0)
             ffn_out_without_down_proj_bias = paddle.add(ffn_out_without_down_proj_bias, down_proj_bias_expand)
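Note: the with_bias branch gathers one down_proj bias row per token, according to the expert that token was routed to, and adds it to the FFN output. A standalone sketch of that gather-and-add (shapes are illustrative):

    import paddle

    num_experts, hidden = 4, 8
    num_tokens = 5

    ffn_out = paddle.zeros([num_tokens, hidden])
    down_proj_bias = paddle.rand([num_experts, hidden])
    # Expert id assigned to each dispatched token.
    expert_idx_per_token = paddle.to_tensor([0, 2, 1, 2, 3])

    # Gather the bias row of each token's expert, then add it elementwise.
    bias_expand = paddle.index_select(down_proj_bias, expert_idx_per_token, axis=0)
    out = paddle.add(ffn_out, bias_expand)
    print(out.shape)  # [5, 8]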
@@ -265,7 +260,6 @@ class CutlassMoEMethod(UnquantizedFusedMoEMethod):
             topk_weights,
             topk_idx,
             expert_idx_per_token,
-            dequant_scale,
         ) = moe_expert_dispatch(
             x,
             gate_out,
@@ -286,21 +280,19 @@ class CutlassMoEMethod(UnquantizedFusedMoEMethod):
             topk_weights,
             topk_idx,
             expert_idx_per_token,
-            dequant_scale,
         ) = moe_expert_dispatch(
             x,
             gate_out,
             layer.gate_correction_bias,
-            (layer.up_gate_proj_in_scale if hasattr(layer, "up_gate_proj_in_scale") else None),
+            (
+                layer.up_gate_proj_in_scale if hasattr(layer, "up_gate_proj_in_scale") else None
+            ),  # if set, permute_input will be int8_t
             layer.top_k,
             False,
             self.moe_quant_type,
             topk_only_mode=False,
         )
 
-        if hasattr(layer, "up_gate_proj_in_scale"):
-            dequant_scale = None
-
         if not layer.with_bias and self.moe_quant_type != "w4a8" and self.moe_quant_type != "w4afp8":
             # only w4a8 needs expert_idx_per_token;
             # the other quant types do not, so we set it to None.
@@ -308,9 +300,7 @@ class CutlassMoEMethod(UnquantizedFusedMoEMethod):
         else:
             expert_idx_per_token = expert_idx_per_token.cast("int64")
 
-        ffn_out = self.compute_ffn(
-            layer, permute_input, token_nums_per_expert, expert_idx_per_token, False, -1, dequant_scale
-        )
+        ffn_out = self.compute_ffn(layer, permute_input, token_nums_per_expert, expert_idx_per_token)
 
         # the reduce step applies the top-k weight normalization and the routed_scaling_factor
         fused_moe_out = moe_expert_reduce(
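Note: for orientation, moe_expert_dispatch permutes tokens into per-expert contiguous blocks (quantizing the activations when an input scale is given, per the int8_t comment above), compute_ffn runs the grouped expert GEMMs, and moe_expert_reduce un-permutes and combines the top-k expert outputs with their routing weights. A schematic pure-Paddle reference of the dispatch/reduce round trip, not the fused kernels (top-1 routing for brevity):

    import paddle

    def dispatch_reference(x, expert_ids, num_experts):
        """Group rows of x by expert id; return permuted rows and per-expert counts."""
        order = paddle.argsort(expert_ids)               # token order sorted by expert
        permuted = paddle.index_select(x, order, axis=0)
        counts = paddle.bincount(expert_ids, minlength=num_experts)
        return permuted, counts, order

    def reduce_reference(expert_out, order, weights):
        """Scatter expert outputs back to token order and apply routing weights."""
        inverse = paddle.argsort(order)                  # inverse of the permutation
        out = paddle.index_select(expert_out, inverse, axis=0)
        return out * weights.unsqueeze(-1)

    x = paddle.rand([6, 4])
    expert_ids = paddle.to_tensor([1, 0, 1, 2, 0, 2])
    weights = paddle.ones([6])
    permuted, counts, order = dispatch_reference(x, expert_ids, 3)
    restored = reduce_reference(permuted, order, weights)
    assert bool(paddle.allclose(restored, x))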
@@ -855,7 +845,7 @@ class CutlassW4AFP8MoEMethod(CutlassMoEMethod):
             weight_name = self.added_weight_attrs[idx]
             weight_list = []
             for i in range(layer.num_local_experts):
-                quant_weight = w4afp8_gemm_weight_convert(weight_tensor[i])
+                quant_weight, scale = weight_quantize(weight_tensor[i], algo=self.moe_quant_type, arch=80)
                 weight_list.append(quant_weight)
             quanted_weight = paddle.stack(weight_list, axis=0)
             getattr(layer, weight_name).set_value(quanted_weight)
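Note: the restored loop quantizes each expert's weight (the diff shows weight_quantize returning the packed weight and its scale) and stacks the per-expert results into one tensor indexed by local expert id. A hedged sketch of that stack-per-expert pattern with a stub quantizer, since the real op and weight layouts are backend-specific:

    import paddle

    def fake_weight_quantize(w):
        """Stub standing in for weight_quantize: returns (quantized weight, scale)."""
        scale = paddle.max(paddle.abs(w), axis=0)          # per-column max
        q = paddle.round(w / scale * 7).astype("int8")     # int4-range values in int8 storage
        return q, scale

    num_local_experts, k, n = 2, 4, 3
    weight_tensor = [paddle.rand([k, n]) for _ in range(num_local_experts)]

    weight_list = []
    for i in range(num_local_experts):
        quant_weight, scale = fake_weight_quantize(weight_tensor[i])
        weight_list.append(quant_weight)

    # One stacked tensor, indexed by local expert id, as the layer attribute expects.
    quanted_weight = paddle.stack(weight_list, axis=0)
    print(quanted_weight.shape)  # [2, 4, 3]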
@@ -885,29 +875,16 @@ class CutlassW4AFP8MoEMethod(CutlassMoEMethod):
         )
 
         # in_scales
-        if not layer.moe_quant_config.moe_dynamic_quant:
-            for in_scale_name in ["up_gate_proj_in_scale", "down_proj_in_scale"]:
-                setattr(
-                    layer,
-                    in_scale_name,
-                    layer.create_parameter(
-                        shape=[layer.num_local_experts],
-                        dtype="float32",
-                        default_initializer=paddle.nn.initializer.Constant(0),
-                    ),
-                )
-        else:
-            if layer.ep_size > 1:
-                for in_scale_name in ["up_gate_proj_in_scale"]:
-                    setattr(
-                        layer,
-                        in_scale_name,
-                        layer.create_parameter(
-                            shape=[layer.num_local_experts],
-                            dtype="float32",
-                            default_initializer=paddle.nn.initializer.Constant(0),
-                        ),
-                    )
+        for in_scale_name in ["up_gate_proj_in_scale", "down_proj_in_scale"]:
+            setattr(
+                layer,
+                in_scale_name,
+                layer.create_parameter(
+                    shape=[layer.num_local_experts],
+                    dtype="float32",
+                    default_initializer=paddle.nn.initializer.Constant(0),
+                ),
+            )
 
         # weight_scales
         setattr(
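Note: these create_parameter calls only allocate placeholders, one zero-initialized float32 scale per local expert; the real values arrive later via set_value once the checkpoint's scales are processed. A minimal sketch of that allocate-then-fill pattern on a bare nn.Layer:

    import paddle
    from paddle import nn

    class Dummy(nn.Layer):
        def __init__(self, num_local_experts):
            super().__init__()
            # Placeholder: shape and dtype fixed now, contents filled in later.
            self.up_gate_proj_in_scale = self.create_parameter(
                shape=[num_local_experts],
                dtype="float32",
                default_initializer=paddle.nn.initializer.Constant(0),
            )

    layer = Dummy(num_local_experts=4)
    # Later, once the checkpoint scales have been loaded and processed:
    layer.up_gate_proj_in_scale.set_value(paddle.to_tensor([0.5, 1.0, 2.0, 4.0]))
    print(layer.up_gate_proj_in_scale.numpy())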
@@ -965,57 +942,10 @@ class CutlassW4AFP8MoEMethod(CutlassMoEMethod):
             return weight_scale
 
         def _process_weight_scale(name: str, weight_scales: list[paddle.Tensor], processed_in_scale: paddle.Tensor):
-            if processed_in_scale is not None:
-                processed_weight_scale = paddle.stack(weight_scales, axis=0) / (448 * 7 * 2 ** (-9))
-                if len(processed_weight_scale.shape) == 3:
-                    processed_weight_scale = (
-                        processed_weight_scale.transpose([0, 2, 1]) / processed_in_scale[:, None, None]
-                    )
-                else:
-                    processed_weight_scale = processed_weight_scale / processed_in_scale[:, None]
-            else:
-                processed_weight_scale = paddle.stack(weight_scales, axis=0) / (448 * 7 * 2 ** (-9))
-
-            if len(processed_weight_scale.shape) == 3:
-                if name == "up_gate_proj_weight_scale" and processed_weight_scale.shape[-1] * 128 != layer.hidden_size:
-                    assert (
-                        layer.hidden_size // 128 % processed_weight_scale.shape[-1] == 0
-                    ), "weight_scale_group_size must be a multiple of 128"
-                    # If it is a multiple of 128, repeat to 128
-                    processed_weight_scale = processed_weight_scale.repeat_interleave(
-                        layer.hidden_size // 128 // processed_weight_scale.shape[-1], axis=-1
-                    )
-                elif (
-                    name == "down_proj_weight_scale"
-                    and processed_weight_scale.shape[-1] * 128 != layer.moe_intermediate_size
-                ):
-                    assert (
-                        layer.moe_intermediate_size // 128 % processed_weight_scale.shape[-1] == 0
-                    ), "weight_scale_group_size must be a multiple of 128"
-                    # If it is a multiple of 128, repeat to 128
-                    processed_weight_scale = processed_weight_scale.repeat_interleave(
-                        layer.moe_intermediate_size // 128 // processed_weight_scale.shape[-1], axis=-1
-                    )
-
-                origin_shape = processed_weight_scale.shape
-                processed_weight_scale = processed_weight_scale.transpose([0, 2, 1])
-                processed_weight_scale = processed_weight_scale.reshape([-1, processed_weight_scale.shape[-1]])
-                processed_weight_scale = _permute_weight_scale(processed_weight_scale)
-                processed_weight_scale = processed_weight_scale.reshape(
-                    [origin_shape[0], origin_shape[2], origin_shape[1] // 128, 128]
-                )
-                processed_weight_scale = processed_weight_scale.transpose([0, 2, 1, 3])
-                setattr(
-                    layer,
-                    name,
-                    layer.create_parameter(
-                        shape=processed_weight_scale.shape,
-                        dtype="float32",
-                        default_initializer=paddle.nn.initializer.Constant(0),
-                    ),
-                )
-            else:
-                processed_weight_scale = _permute_weight_scale(processed_weight_scale)
+            processed_weight_scale = (
+                paddle.stack(weight_scales, axis=0) / (448 * 7 * 2 ** (-9)) / processed_in_scale[:, None]
+            )
+            processed_weight_scale = _permute_weight_scale(processed_weight_scale)
             getattr(layer, name).set_value(processed_weight_scale)
 
         # 1. Init scale containers and maps
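Note: on the magic constant, 448 is the largest finite value of FP8 E4M3 (the activation format) and 7 is the int4 maximum magnitude (the weight format), so 448 * 7 * 2 ** (-9) = 6.125 folds both quantization ranges, plus a 2^-9 shift, into one divisor applied to the checkpoint scales; reading the 2^-9 factor as a kernel-internal shift is an inference, not confirmed by the source. The removed branch additionally widened group-wise scales to 128-wide groups with repeat_interleave. A standalone check of both pieces:

    import paddle

    # The combined divisor used on the checkpoint weight scales.
    FP8_E4M3_MAX = 448.0   # activation range (FP8 E4M3)
    INT4_MAX = 7.0         # weight range (int4)
    divisor = FP8_E4M3_MAX * INT4_MAX * 2 ** (-9)
    assert divisor == 6.125

    # Widening a group-wise scale: hidden_size=512 with group size 256 gives
    # 2 groups; repeating each group 512 // 128 // 2 = 2 times yields 128-wide groups.
    hidden_size = 512
    scale = paddle.to_tensor([[1.0, 2.0]])            # [experts=1, groups=2]
    repeats = hidden_size // 128 // scale.shape[-1]   # -> 2
    widened = paddle.repeat_interleave(scale, repeats, axis=1)
    print(widened.numpy())  # [[1. 1. 2. 2.]]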
@@ -1062,15 +992,16 @@ class CutlassW4AFP8MoEMethod(CutlassMoEMethod):
                 scale_tensor = _extract_scale_tensor(layer, state_dict, scale_key_template, expert_idx)
                 scale_weight_map[name].append(scale_tensor)
 
         # 3. Process scale tensor and set to layer
+        in_scales = []
+        for in_scale_name in ["up_gate_proj_in_scale", "down_proj_in_scale"]:
+            in_scales.append(_process_in_scale(in_scale_name, scale_weight_map[in_scale_name]))
+
         for i, weight_scale_name in enumerate(["up_gate_proj_weight_scale", "down_proj_weight_scale"]):
-            in_scale_name = weight_scale_name.replace("_weight_scale", "_in_scale")
-            in_scale = None
-            if hasattr(layer, in_scale_name) and in_scale_name in scale_weight_map.keys():
-                in_scale = _process_in_scale(in_scale_name, scale_weight_map[in_scale_name])
             _process_weight_scale(
                 weight_scale_name,
                 scale_weight_map[weight_scale_name],
-                in_scale,
+                in_scales[i],
            )
+
 
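Note: the removed lookup derived each input-scale attribute from its weight-scale name by suffix replacement, processing the in_scale lazily only when present; the restored code precomputes both in_scales up front and indexes them by position. The suffix trick, for reference:

    # Derive the paired in_scale name from a weight_scale name by suffix swap.
    for weight_scale_name in ["up_gate_proj_weight_scale", "down_proj_weight_scale"]:
        in_scale_name = weight_scale_name.replace("_weight_scale", "_in_scale")
        print(weight_scale_name, "->", in_scale_name)
    # up_gate_proj_weight_scale -> up_gate_proj_in_scale
    # down_proj_weight_scale -> down_proj_in_scale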
@@ -275,7 +275,6 @@ class CutlassWint2FusedMoeMethod(Wint2MoeMethod):
             topk_weights,
             topk_idx,
             expert_idx_per_token,
-            dequant_scale,
         ) = moe_expert_dispatch(
             x,
             gate_out,
@@ -39,7 +39,6 @@ class MixQuantConfig(QuantConfigBase):
         is_permuted: bool = True,
         is_quantized: bool = False,
         hadamard_block_size: int = 128,
-        moe_dynamic_quant: bool = False,
     ) -> None:
         super().__init__()
         self.dense_quant_type = dense_quant_type
@@ -58,7 +57,6 @@ class MixQuantConfig(QuantConfigBase):
         self.is_checkpoint_bf16 = not is_quantized
         self.is_quantized = is_quantized
         self.hadamard_block_size = hadamard_block_size
-        self.moe_dynamic_quant = moe_dynamic_quant
 
     def name(self) -> str:
         return "mix_quant"
@@ -75,7 +73,6 @@ class MixQuantConfig(QuantConfigBase):
             config.get("is_permuted", True),
             config.get("is_quantized", False),
             config.get("hadamard_block_size", 128),
-            config.get("moe_dynamic_quant", False),
         )
 
     def get_quant_method(self, layer) -> Optional[QuantMethodBase]:
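Note: MixQuantConfig.from_config reads every field with config.get and a default, so checkpoints whose JSON lacks the newer keys still load; after the revert a moe_dynamic_quant key is simply ignored. A reduced sketch of the same pattern (field subset only, class name is a stand-in):

    class MixQuantConfigSketch:
        """Reduced stand-in for MixQuantConfig to show the from_config pattern."""

        def __init__(self, is_permuted=True, is_quantized=False, hadamard_block_size=128):
            self.is_permuted = is_permuted
            self.is_quantized = is_quantized
            self.hadamard_block_size = hadamard_block_size

        @classmethod
        def from_config(cls, config: dict) -> "MixQuantConfigSketch":
            # Missing keys fall back to defaults, keeping old checkpoints loadable.
            return cls(
                config.get("is_permuted", True),
                config.get("is_quantized", False),
                config.get("hadamard_block_size", 128),
            )

    cfg = MixQuantConfigSketch.from_config({"hadamard_block_size": 64})
    assert cfg.hadamard_block_size == 64 and cfg.is_permuted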