Sync v2.0 version of code to github repo

2025-09-27 12:52:29 +08:00 · 2025-06-29 23:29:37 +00:00
parent d151496038
commit 92c2cfa2e7
597 changed files with 78776 additions and 22905 deletions
--- a/fastdeploy/model_executor/layers/quantization/wfp8afp8.py
+++ b/fastdeploy/model_executor/layers/quantization/wfp8afp8.py
@@ -17,10 +17,10 @@ from typing import Optional

 import paddle

-import fastdeploy
-from fastdeploy.platforms.utils import convert_to_npu_dequant_scale
-
-from .quant_base import QuantConfigBase, QuantMethodBase
+from fastdeploy.model_executor.layers.quantization.ops import (
+    cutlass_scaled_mm, scaled_fp8_quant)
+from fastdeploy.model_executor.layers.quantization.quant_base import (
+    QuantConfigBase, QuantMethodBase)


 class WFP8AFP8Config(QuantConfigBase):
@@ -32,17 +32,26 @@ class WFP8AFP8Config(QuantConfigBase):
        super().__init__()
        self.weight_scale_dict = weight_scale_dict
        self.act_scale_dict = act_scale_dict
+        self.quant_max_bound = 448
+        self.quant_min_bound = -448
+        self.quant_round_type = 1

-    def get_name(self) -> str:
+    def name(self) -> str:
+        """Return the identifier string of this quantization scheme,
+        ``wfp8afp8``."""
         return "wfp8afp8"

     @classmethod
     def from_config(cls, config: dict) -> "WFP8AFP8Config":
-        weight_scale_dict = config["weight_scale_dict"]
-        act_scale_dict = config["act_scale_dict"]
+        """Build a config from ``config``; the ``weight_scale_dict`` and
+        ``act_scale_dict`` keys are optional and default to None when absent."""
+        weight_scale_dict = config.get("weight_scale_dict", None)
+        act_scale_dict = config.get("act_scale_dict", None)
         return cls(weight_scale_dict, act_scale_dict)

     def get_quant_method(self, layer) -> Optional[QuantMethodBase]:
+        """Return a WFP8AFP8LinearMethod constructed from this config;
+        ``layer`` is not consulted."""
         return WFP8AFP8LinearMethod(self)


@@ -59,58 +68,49 @@ class WFP8AFP8LinearMethod(QuantMethodBase):
        self.quant_config = quant_config

     def create_weights(self, layer):
+        """Reverse ``layer.linear_weight_shape`` in place, set ``layer.weight_dtype`` to
+        float8_e4m3fn and create a one-element float32 ``linear_weight_scale`` initialised to 0."""
+        layer.linear_weight_shape.reverse()
+        layer.weight_dtype = "float8_e4m3fn"
         # TODO(YuanRisheng): set weight logic should be moved to process_loaded_weights func
-        weight_scale = self.quant_config.weight_scale_dict.get(
-            layer.prefix + ".weight_quanter")
-        in_scale = self.quant_config.act_scale_dict.get(layer.prefix +
-                                                        ".activation_quanter")
         self.skip_quant = False
-        # we will skip quant if weight_scale is not found or in_scale is not found
-        if weight_scale is None or in_scale is None:
-            self.skip_quant = True
-        else:
-            max_range = 448.0
-            layer.scalar_scale_name = layer.prefix + ".scalar_weight_quanter"
-            layer.scalar_scale = layer.create_parameter(
-                shape=([1]),
-                dtype="float32",
-            )
-            layer.scalar_scale.set_value(
-                paddle.to_tensor([1.0 / (max_range * in_scale)],
-                                 dtype="float32"))
-            linear_out_scale = paddle.to_tensor(weight_scale /
-                                                max_range).astype("float32")
-            layer.linear_out_scale = layer.create_parameter(
-                shape=[layer.embed_dim],
-                dtype="float32",
-                is_bias=False,
-                default_initializer=paddle.nn.initializer.Constant(0),
-            )
-            layer.linear_out_scale.set_value(
-                convert_to_npu_dequant_scale(linear_out_scale))
+        layer.linear_weight_scale = layer.create_parameter(
+            shape=[1],
+            dtype="float32",
+            is_bias=False,
+            default_initializer=paddle.nn.initializer.Constant(0),
+        )

     def process_loaded_weights(self, layer, weights) -> None:
-        # TODO(YuanRisheng): We should abstract the ‌skip_quant‌ logic to adapt to more quant methods
+        """Load ``weights`` into ``layer``: cast and store them as-is when ``skip_quant`` is set,
+        otherwise transpose, quantize with scaled_fp8_quant and store the weight and its scale."""
         if self.skip_quant:
             weight_tensor = weights.cast(layer._dtype)
             layer.linear_weight.set_value(weight_tensor)
             return
-        weight_tensor = weights.transpose([1, 0])
-        weight_tensor = paddle.cast(weight_tensor, self.weight_dtype)
-        self.linear_weight.copy_(weight_tensor, False)
+        if weights.dtype != paddle.float8_e4m3fn:
+            self.use_per_token_if_dynamic = True
+        weight_tensor = weights.transpose([1, 0]).contiguous()
+        qweight, weight_scale = scaled_fp8_quant(
+            weight_tensor,
+            use_per_token_if_dynamic=False,
+        )
+        layer.linear_weight.copy_(qweight, False)
+        layer.linear_weight_scale.set_value(weight_scale)

     def apply(self, layer, x):
+        """Return the linear output for ``x``: a matmul with the transposed weight when ``skip_quant``
+        is set, else quantize ``x`` and call cutlass_scaled_mm (NotImplementedError if not per-token)."""
         if self.skip_quant:
             linear_out = paddle.matmul(x, layer.linear_weight, False, True)
             return linear_out
-        linear_out = fastdeploy.model_executor.ops.gpu.per_channel_fp8_fp8_half_gemm_fused(
-            x,
-            layer.linear_weight,
-            bias=layer.linear_bias if layer.add_bias else None,
-            scalar_scale=layer.scalar_scale,
-            channel_scale=layer.linear_out_scale,
-            transpose_x=False,
-            transpose_y=True,
-            output_dtype=layer._dtype,
-        )
+        if self.use_per_token_if_dynamic:
+            out_type = x.dtype
+            a_q, a_scales = scaled_fp8_quant(
+                x, use_per_token_if_dynamic=self.use_per_token_if_dynamic)
+            linear_out = cutlass_scaled_mm(a_q, layer.linear_weight, a_scales,
+                                           layer.linear_weight_scale, out_type,
+                                           layer.linear_bias)
+        else:
+            raise NotImplementedError
         return linear_out