Mirror of https://github.com/PaddlePaddle/FastDeploy.git, synced 2025-10-08 18:11:00 +08:00
polish code with new pre-commit rule (#2923)
@@ -13,6 +13,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """
+
 from .cutlass_scaled_mm import cutlass_scaled_mm
 from .scaled_fp8_quant import scaled_fp8_quant
 
@@ -13,6 +13,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """
+
 from typing import Optional
 
 import paddle
@@ -20,12 +21,14 @@ import paddle
 import fastdeploy
 
 
-def cutlass_scaled_mm(a: paddle.Tensor,
-                      b: paddle.Tensor,
-                      scale_a: paddle.Tensor,
-                      scale_b: paddle.Tensor,
-                      out_dtype: paddle.dtype,
-                      bias: Optional[paddle.Tensor] = None) -> paddle.Tensor:
+def cutlass_scaled_mm(
+    a: paddle.Tensor,
+    b: paddle.Tensor,
+    scale_a: paddle.Tensor,
+    scale_b: paddle.Tensor,
+    out_dtype: paddle.dtype,
+    bias: Optional[paddle.Tensor] = None,
+) -> paddle.Tensor:
     """
     `cutlass_scaled_mm` implements a fused version of
     `output = paddle.mm((scale_a * a), (scale_b * b)).to(out_dtype)`
@@ -48,9 +51,8 @@ def cutlass_scaled_mm(a: paddle.Tensor,
     scale_a.shape * [1, 128] == a.shape
     scale_b.shape * [128, 128] == b.shape
     """
-    assert (out_dtype == paddle.bfloat16 or out_dtype == paddle.float16)
-    assert bias is None or bias.shape[0] == b.shape[
-        0] and bias.dtype == out_dtype
+    assert out_dtype == paddle.bfloat16 or out_dtype == paddle.float16
+    assert bias is None or bias.shape[0] == b.shape[0] and bias.dtype == out_dtype
     # Ensure input tensors have valid shapes
     # assert a.numel() > 0, "Input tensor 'a' must not be empty"
     # assert b.numel() > 0, "Input tensor 'b' must not be empty"
@@ -59,12 +61,11 @@ def cutlass_scaled_mm(a: paddle.Tensor,
 
     m = a.shape[0]
     n = b.shape[0]
-    cutlass_compatible_b = (b.shape[0] % 16 == 0 and b.shape[1] % 16 == 0)
+    cutlass_compatible_b = b.shape[0] % 16 == 0 and b.shape[1] % 16 == 0
     assert cutlass_compatible_b
 
     out = paddle.empty([m, n], dtype=out_dtype)
-    fastdeploy.model_executor.ops.gpu.cutlass_scaled_mm(
-        out, a, b, scale_a, scale_b, bias)
+    fastdeploy.model_executor.ops.gpu.cutlass_scaled_mm(out, a, b, scale_a, scale_b, bias)
 
     return out
 
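Note on the hunks above: the docstring describes a fused dequantize-and-matmul, `output = paddle.mm((scale_a * a), (scale_b * b)).to(out_dtype)`, where the scales may be per-tensor, per-token, per-channel, or 128-blockwise as the `scale_a.shape * [1, 128] == a.shape` lines spell out. Below is a minimal unfused sketch of that contract, assuming `b` is stored as `[n, k]` (consistent with `n = b.shape[0]` above); the helper name `scaled_mm_reference` is ours for illustration, not a FastDeploy API.

import paddle

def scaled_mm_reference(a, b, scale_a, scale_b, out_dtype, bias=None):
    # Dequantize each operand, then matmul in float32 and cast down.
    a_deq = a.cast("float32") * scale_a                   # [m, k]; scale_a broadcasts
    b_deq = b.cast("float32") * scale_b                   # [n, k]; scale_b broadcasts
    out = paddle.matmul(a_deq, b_deq, transpose_y=True)   # [m, n]
    if bias is not None:
        out = out + bias.cast("float32")                  # bias has shape [n]
    return out.cast(out_dtype)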
@@ -100,7 +101,7 @@ def scaled_fp8_quant(
         scaling factor.
     """
     # This code assumes batch_dim and num_tokens are flattened
-    assert (input.ndim == 2)
+    assert input.ndim == 2
     shape = input.shape
     if num_token_padding:
         shape = (max(num_token_padding, input.shape[0]), shape[1])
@@ -109,18 +110,21 @@ def scaled_fp8_quant(
     if scale is None:
         if use_per_token_if_dynamic:
             scale = paddle.empty([shape[0], 1], dtype=paddle.float32)
-            from fastdeploy.model_executor.ops.gpu import \
-                dynamic_per_token_scaled_fp8_quant
+            from fastdeploy.model_executor.ops.gpu import (
+                dynamic_per_token_scaled_fp8_quant,
+            )
+
             dynamic_per_token_scaled_fp8_quant(output, input, scale, scale_ub)
         else:
             scale = paddle.zeros([1], dtype=paddle.float32)
-            from fastdeploy.model_executor.ops.gpu import \
-                dynamic_scaled_fp8_quant
+            from fastdeploy.model_executor.ops.gpu import dynamic_scaled_fp8_quant
+
             dynamic_scaled_fp8_quant(output, input, scale)
     else:
         # num_token_padding not implemented for this case
         # assert (scale.numel() == 1 or num_token_padding is None)
         from fastdeploy.model_executor.ops.gpu import static_scaled_fp8_quant
+
         static_scaled_fp8_quant(output, input, scale)
 
     return output, scale
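The branches above dispatch to one of three custom GPU ops: dynamic per-token, dynamic per-tensor, or static per-tensor quantization. A hedged usage sketch of the three modes follows; the wrapper's import path is assumed (only a relative re-export is visible in this commit), and keyword names follow the parameters shown in the hunks.

import paddle

# Import path assumed: scaled_fp8_quant is re-exported by the package
# __init__ in the first hunk, whose full path this diff does not show.
from fastdeploy.model_executor.layers.quantization.ops import scaled_fp8_quant

x = paddle.randn([16, 4096]).cast("bfloat16")

# Dynamic per-tensor: scale=None, the op fills a [1]-shaped scale.
out, scale = scaled_fp8_quant(x)

# Dynamic per-token: one scale per row, returned with shape [16, 1].
out, scale = scaled_fp8_quant(x, use_per_token_if_dynamic=True)

# Static: a precomputed per-tensor scale is used as-is.
s = paddle.to_tensor([0.05], dtype="float32")
out, scale = scaled_fp8_quant(x, scale=s)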
@@ -13,6 +13,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """
+
 from typing import Optional
 
 import paddle
@@ -49,7 +50,7 @@ def scaled_fp8_quant(
         scaling factor.
     """
     # This code assumes batch_dim and num_tokens are flattened
-    assert (input.ndim == 2)
+    assert input.ndim == 2
     shape = input.shape
     if num_token_padding:
         shape = (max(num_token_padding, input.shape[0]), shape[1])
@@ -58,18 +59,21 @@ def scaled_fp8_quant(
     if scale is None:
         if use_per_token_if_dynamic:
             scale = paddle.empty([shape[0], 1], dtype=paddle.float32)
-            from fastdeploy.model_executor.ops.gpu import \
-                dynamic_per_token_scaled_fp8_quant
+            from fastdeploy.model_executor.ops.gpu import (
+                dynamic_per_token_scaled_fp8_quant,
+            )
+
             dynamic_per_token_scaled_fp8_quant(output, input, scale, scale_ub)
         else:
             scale = paddle.zeros([1], dtype=paddle.float32)
-            from fastdeploy.model_executor.ops.gpu import \
-                dynamic_scaled_fp8_quant
+            from fastdeploy.model_executor.ops.gpu import dynamic_scaled_fp8_quant
+
             dynamic_scaled_fp8_quant(output, input, scale)
     else:
         # num_token_padding not implemented for this case
         # assert (scale.numel() == 1 or num_token_padding is None)
         from fastdeploy.model_executor.ops.gpu import static_scaled_fp8_quant
+
         static_scaled_fp8_quant(output, input, scale)
 
     return output, scale
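For intuition about what `dynamic_per_token_scaled_fp8_quant` computes in both files above: conventional per-token FP8 (E4M3) quantization picks one scale per row so the scaled row fits the FP8 range, whose largest finite value is 448.0. A plain-Paddle sketch of that standard technique (our illustration, not the actual CUDA kernel):

import paddle

FP8_E4M3_MAX = 448.0  # largest finite float8 e4m3 value

def per_token_fp8_quant_reference(x):
    # One scale per row so that x / scale fits inside the FP8 range.
    amax = x.cast("float32").abs().max(axis=-1, keepdim=True)   # [m, 1]
    scale = paddle.clip(amax, min=1e-12) / FP8_E4M3_MAX         # [m, 1]
    q = paddle.clip(x.cast("float32") / scale, -FP8_E4M3_MAX, FP8_E4M3_MAX)
    # The real kernel would round/cast q to float8 here; we keep float32.
    return q, scale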