polish code with new pre-commit rule (#2923)

Zero Rains
2025-07-19 23:19:27 +08:00
committed by GitHub
parent b8676d71a8
commit 25698d56d1
424 changed files with 14307 additions and 13518 deletions


@@ -22,24 +22,29 @@ __all__ = []
if current_platform.is_xpu():
from . import xpu
from .xpu import *
if hasattr(xpu, '__all__'):
# fix: F403 `from .xpu import *` used; unable to detect undefined names
if hasattr(xpu, "__all__"):
globals().update({name: getattr(xpu, name) for name in xpu.__all__})
__all__.extend(xpu.__all__)
if current_platform.is_npu():
from . import npu
from .npu import *
if hasattr(npu, '__all__'):
if hasattr(npu, "__all__"):
globals().update({name: getattr(npu, name) for name in npu.__all__})
__all__.extend(npu.__all__)
if current_platform.is_gcu():
from . import gcu
from .gcu import *
if hasattr(gcu, '__all__'):
if hasattr(gcu, "__all__"):
globals().update({name: getattr(gcu, name) for name in gcu.__all__})
__all__.extend(gcu.__all__)
if current_platform.is_dcu():
from .dcu import *
from . import dcu
if hasattr(dcu, '__all__'):
__all__.extend(dcu.__all__)
if hasattr(dcu, "__all__"):
globals().update({name: getattr(dcu, name) for name in dcu.__all__})
__all__.extend(dcu.__all__)

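The hunk above settles on a guarded re-export idiom: star-import the backend for its side effects, then copy only the names the submodule declares in __all__ into the package namespace, which is explicit enough to silence Ruff's F403. The following is a standalone sketch of that pattern, not part of the diff; the module and package names are hypothetical.

import importlib

__all__ = []

def _reexport_backend(package: str, module_name: str) -> None:
    # Import a platform backend and re-export only its declared public names,
    # mirroring the hasattr(mod, "__all__") guard used in the diff above.
    mod = importlib.import_module(f".{module_name}", package=package)
    if hasattr(mod, "__all__"):
        globals().update({name: getattr(mod, name) for name in mod.__all__})
        __all__.extend(mod.__all__)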

@@ -19,4 +19,4 @@ dcu backend methods
from .fused_moe_triton_backends import DCUTritonWeightOnlyMoEMethod
from .weight_only import DCUWeightOnlyLinearMethod
__all__ = ['DCUTritonWeightOnlyMoEMethod', 'DCUWeightOnlyLinearMethod']
__all__ = ["DCUTritonWeightOnlyMoEMethod", "DCUWeightOnlyLinearMethod"]


@@ -17,10 +17,8 @@
import paddle
from paddle import nn
from fastdeploy.distributed.communication_op import \
tensor_model_parallel_all_reduce
from fastdeploy.model_executor.layers.quantization.quant_base import \
QuantMethodBase
from fastdeploy.distributed.communication_op import tensor_model_parallel_all_reduce
from fastdeploy.model_executor.layers.quantization.quant_base import QuantMethodBase
from fastdeploy.utils import ceil_div
@@ -36,7 +34,8 @@ class DCUTritonWeightOnlyMoEMethod(QuantMethodBase):
self.quant_method = quant_method
self.added_weight_attrs = ["up_gate_proj_weight", "down_proj_weight"]
self.added_scale_attrs = [
"up_gate_proj_weight_scale", "down_proj_weight_scale"
"up_gate_proj_weight_scale",
"down_proj_weight_scale",
]
def process_prequanted_weights(self, layer: nn.Layer, state_dict) -> None:
@@ -52,10 +51,12 @@ class DCUTritonWeightOnlyMoEMethod(QuantMethodBase):
assert len(down_proj_weights) == layer.num_local_experts
assert self.quant_method.name() == "wint8"
assert up_gate_proj_weights[0].shape == [
layer.hidden_size, layer.moe_intermediate_size * 2
layer.hidden_size,
layer.moe_intermediate_size * 2,
]
assert down_proj_weights[0].shape == [
layer.moe_intermediate_size, layer.hidden_size
layer.moe_intermediate_size,
layer.hidden_size,
]
up_gate_proj_tensor = paddle.stack(up_gate_proj_weights, axis=0)
@@ -71,26 +72,29 @@ class DCUTritonWeightOnlyMoEMethod(QuantMethodBase):
scale_name = self.added_scale_attrs[idx]
quanted_weight_scale = weight_tensor.abs().max(axis=1)
quanted_weight = weight_tensor / quanted_weight_scale[:,
None, :] * max_bound
quanted_weight = weight_tensor / quanted_weight_scale[:, None, :] * max_bound
quanted_weight = paddle.round(quanted_weight).astype("int8")
quanted_weight_scale = quanted_weight_scale / max_bound
setattr(
layer, weight_name,
layer,
weight_name,
layer.create_parameter(
shape=quanted_weight.shape,
dtype=quanted_weight.dtype,
default_initializer=paddle.nn.initializer.Constant(0),
))
),
)
getattr(layer, weight_name).set_value(quanted_weight)
setattr(
layer, scale_name,
layer,
scale_name,
layer.create_parameter(
shape=quanted_weight_scale.shape,
dtype=quanted_weight_scale.dtype,
))
),
)
getattr(layer, scale_name).set_value(quanted_weight_scale)
def apply(
@@ -112,10 +116,7 @@ class DCUTritonWeightOnlyMoEMethod(QuantMethodBase):
gate_out = paddle.matmul(x.cast("float32"), layer.gate_weight)
scores = paddle.nn.functional.softmax(gate_out, axis=-1)
scores += layer.gate_correction_bias
topk_weights, topk_ids = paddle.topk(scores,
k=top_k,
axis=-1,
sorted=False)
topk_weights, topk_ids = paddle.topk(scores, k=top_k, axis=-1, sorted=False)
topk_weights = topk_weights / topk_weights.sum(axis=-1, keepdim=True)
intermediate_cache1 = paddle.empty(
@@ -140,11 +141,15 @@ class DCUTritonWeightOnlyMoEMethod(QuantMethodBase):
from fastdeploy.model_executor.ops.gpu import tritonmoe_preprocess
from .triton_moe_kernels import fused_moe_kernel_paddle
sorted_token_ids, expert_ids, num_tokens_post_padded = tritonmoe_preprocess(
topk_ids, num_local_experts, config["BLOCK_SIZE_M"])
topk_ids, num_local_experts, config["BLOCK_SIZE_M"]
)
max_num_tokens_padded = sorted_token_ids.shape[0]
grid = (ceil_div(max_num_tokens_padded, config["BLOCK_SIZE_M"]) *
ceil_div(moe_intermediate_size * 2, config["BLOCK_SIZE_N"]), )
grid = (
ceil_div(max_num_tokens_padded, config["BLOCK_SIZE_M"])
* ceil_div(moe_intermediate_size * 2, config["BLOCK_SIZE_N"]),
)
fused_moe_kernel_paddle[grid](
x,
@@ -188,11 +193,11 @@ class DCUTritonWeightOnlyMoEMethod(QuantMethodBase):
even_Ks=hidden_size % config["BLOCK_SIZE_K"] == 0,
)
intermediate_cache2 = paddle.incubate.nn.functional.swiglu(
intermediate_cache1)
intermediate_cache2 = paddle.incubate.nn.functional.swiglu(intermediate_cache1)
grid = (ceil_div(max_num_tokens_padded, config["BLOCK_SIZE_M"]) *
ceil_div(hidden_size, config["BLOCK_SIZE_N"]), )
grid = (
ceil_div(max_num_tokens_padded, config["BLOCK_SIZE_M"]) * ceil_div(hidden_size, config["BLOCK_SIZE_N"]),
)
fused_moe_kernel_paddle[grid](
intermediate_cache2,
layer.down_proj_weight,

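For reference, the wint8 path reformatted above quantizes each expert's weight with a per-output-channel abs-max scale (scale over the K axis, then round to int8 with max_bound = 127). A minimal NumPy sketch of the same scheme; shapes and names are illustrative, not the layer's real attributes.

import numpy as np

def absmax_int8_quant(w: np.ndarray, max_bound: float = 127.0):
    # w: [num_experts, K, N]; the scale is the abs-max over K, so it has
    # shape [num_experts, N], matching quanted_weight_scale above.
    scale = np.abs(w).max(axis=1)
    q = np.round(w / scale[:, None, :] * max_bound).astype(np.int8)
    return q, scale / max_bound

w = np.random.randn(2, 8, 4).astype(np.float32)
q, s = absmax_int8_quant(w)
reconstructed = q.astype(np.float32) * s[:, None, :]
print(np.abs(w - reconstructed).max())  # small quantization error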

@@ -29,7 +29,6 @@ def fused_moe_kernel_paddle(
sorted_token_ids_ptr,
expert_ids_ptr,
num_tokens_post_padded_ptr,
# Matrix dimensions
N,
K,
@@ -108,16 +107,13 @@ def fused_moe_kernel_paddle(
offs_bn = (pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)) % N
offs_k = tl.arange(0, BLOCK_SIZE_K)
a_ptrs = a_ptr + (offs_token[:, None] // top_k * stride_am +
offs_k[None, :] * stride_ak)
a_ptrs = a_ptr + (offs_token[:, None] // top_k * stride_am + offs_k[None, :] * stride_ak)
off_experts = tl.load(expert_ids_ptr + pid_m)
b_ptrs = b_ptr + off_experts * stride_be + (offs_k[:, None] * stride_bk +
offs_bn[None, :] * stride_bn)
b_ptrs = b_ptr + off_experts * stride_be + (offs_k[:, None] * stride_bk + offs_bn[None, :] * stride_bn)
if use_int8_w8a16:
b_scale_ptrs = b_scale_ptr + off_experts * stride_bse + offs_bn[
None, :] * stride_bsn
b_scale_ptrs = b_scale_ptr + off_experts * stride_bse + offs_bn[None, :] * stride_bsn
b_scale = tl.load(b_scale_ptrs)
if use_fp8_w8a8:
@@ -139,19 +135,14 @@ def fused_moe_kernel_paddle(
mask=token_mask[:, None],
other=0.0,
)
b = tl.load(b_ptrs,
cache_modifier=".cv",
eviction_policy='evict_first')
b = tl.load(b_ptrs, cache_modifier=".cv", eviction_policy="evict_first")
else:
a = tl.load(
a_ptrs,
mask=token_mask[:, None] &
(offs_k[None, :] < K - k * BLOCK_SIZE_K),
mask=token_mask[:, None] & (offs_k[None, :] < K - k * BLOCK_SIZE_K),
other=0.0,
)
b = tl.load(b_ptrs,
mask=offs_k[:, None] < K - k * BLOCK_SIZE_K,
other=0.0)
b = tl.load(b_ptrs, mask=offs_k[:, None] < K - k * BLOCK_SIZE_K, other=0.0)
# We accumulate along the K dimension.
if use_int8_w8a16:
@@ -160,13 +151,14 @@ def fused_moe_kernel_paddle(
if group_k > 0 and group_n > 0:
k_start = k * BLOCK_SIZE_K
offs_ks = k_start // group_k
a_scale = tl.load(a_scale_ptrs + offs_ks * stride_ask,
mask=token_mask,
other=0.0)
a_scale = tl.load(
a_scale_ptrs + offs_ks * stride_ask,
mask=token_mask,
other=0.0,
)
b_scale = tl.load(b_scale_ptrs + offs_ks * stride_bsk)
accumulator += tl.dot(a, b) * a_scale[:,
None] * b_scale[None, :]
accumulator += tl.dot(a, b) * a_scale[:, None] * b_scale[None, :]
else:
accumulator = tl.dot(a, b, acc=accumulator)
else:
@@ -176,9 +168,7 @@ def fused_moe_kernel_paddle(
b_ptrs += BLOCK_SIZE_K * stride_bk
if MUL_ROUTED_WEIGHT:
moe_weight = tl.load(topk_weights_ptr + offs_token,
mask=token_mask,
other=0)
moe_weight = tl.load(topk_weights_ptr + offs_token, mask=token_mask, other=0)
accumulator = accumulator * moe_weight[:, None]
if use_int8_w8a16:
accumulator = (accumulator * b_scale).to(compute_type)
@@ -191,8 +181,7 @@ def fused_moe_kernel_paddle(
accumulator = accumulator.to(compute_type)
# Write back the block of the output
offs_cn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
c_ptrs = c_ptr + stride_cm * offs_token[:, None] + stride_cn * offs_cn[
None, :]
c_ptrs = c_ptr + stride_cm * offs_token[:, None] + stride_cn * offs_cn[None, :]
c_mask = token_mask[:, None] & (offs_cn[None, :] < N)
tl.store(c_ptrs, accumulator, mask=c_mask)

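The launch grid reshaped in the backend above is one-dimensional: it covers every (M-block, N-block) output tile of the padded token matrix, and each Triton program works on one BLOCK_SIZE_M x BLOCK_SIZE_N tile. A small sketch of that sizing, assuming the usual ceil_div helper ((a + b - 1) // b); the example numbers are made up.

def ceil_div(a: int, b: int) -> int:
    return (a + b - 1) // b

def moe_grid_size(max_num_tokens_padded: int, n: int, block_m: int, block_n: int) -> int:
    # One program per (BLOCK_SIZE_M x BLOCK_SIZE_N) output tile.
    return ceil_div(max_num_tokens_padded, block_m) * ceil_div(n, block_n)

# Example: 192 padded tokens, N = moe_intermediate_size * 2 = 1024.
print(moe_grid_size(192, 1024, block_m=32, block_n=64))  # 6 * 16 = 96 programs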

@@ -13,11 +13,14 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""
import paddle
from paddle.nn.quant import weight_dequantize
from fastdeploy.model_executor.layers.quantization.weight_only import (
GPUWeightOnlyLinearMethod, WeightOnlyConfig)
GPUWeightOnlyLinearMethod,
WeightOnlyConfig,
)
class DCUWeightOnlyLinearMethod(GPUWeightOnlyLinearMethod):
@@ -38,7 +41,7 @@ class DCUWeightOnlyLinearMethod(GPUWeightOnlyLinearMethod):
x=layer.weight,
scale=layer.weight_scale,
algo=self.quant_config.algo,
out_dtype=paddle.get_default_dtype()
out_dtype=paddle.get_default_dtype(),
)
linear_out = paddle.matmul(x, dequant_out)
if layer.bias is not None:

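The DCU apply path shown above dequantizes the int8 weight with its per-channel scale and then runs the matmul in the activation dtype. A NumPy approximation of that flow follows; the axis layout is illustrative, and the real paddle.nn.quant.weight_dequantize additionally handles the algo-specific packing.

import numpy as np

def weight_only_linear(x: np.ndarray, qweight: np.ndarray, scale: np.ndarray) -> np.ndarray:
    # x: [M, K] float32, qweight: [K, N] int8, scale: [N] per output channel.
    w = qweight.astype(x.dtype) * scale[None, :]
    return x @ w

x = np.random.randn(3, 16).astype(np.float32)
qw = np.random.randint(-127, 128, size=(16, 8), dtype=np.int8)
scale = np.random.rand(8).astype(np.float32) / 127.0
print(weight_only_linear(x, qw, scale).shape)  # (3, 8)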

@@ -18,14 +18,13 @@ gcu backend methods
from .attention.flash_attn_backend import GCUFlashAttnBackend
from .attention.mem_efficient_attn_backend import GCUMemEfficientAttnBackend
from .moe.fused_moe_method_gcu_backend import (GCUFusedMoeMethod,
GCUWeightOnlyMoEMethod)
from .moe.fused_moe_method_gcu_backend import GCUFusedMoeMethod, GCUWeightOnlyMoEMethod
from .quantization.weight_only import GCUWeightOnlyLinearMethod
__all__ = [
'GCUFlashAttnBackend',
'GCUMemEfficientAttnBackend',
'GCUFusedMoeMethod',
'GCUWeightOnlyMoEMethod',
'GCUWeightOnlyLinearMethod',
"GCUFlashAttnBackend",
"GCUMemEfficientAttnBackend",
"GCUFusedMoeMethod",
"GCUWeightOnlyMoEMethod",
"GCUWeightOnlyLinearMethod",
]


@@ -17,31 +17,33 @@
from __future__ import annotations
import os
from dataclasses import dataclass, field
from dataclasses import dataclass
from typing import TYPE_CHECKING, List, Optional
import paddle
import numpy as np
import paddle
from fastdeploy.config import FDConfig
from fastdeploy.model_executor.layers.attention.attention import Attention
from fastdeploy.model_executor.layers.attention.base_attention_backend import (
AttentionBackend, AttentionMetadata)
AttentionBackend,
AttentionMetadata,
)
if TYPE_CHECKING:
from fastdeploy.model_executor.forward_meta import ForwardMeta, ForwardMode
from fastdeploy.model_executor.ops.gcu import (fused_rotary_embedding,
mem_efficient_attention,
flash_attn_var_len)
from paddleformers.utils.log import logger
from fastdeploy.model_executor.ops.gcu import flash_attn_var_len, fused_rotary_embedding
@dataclass
class GCUFlashAttnMetadata(AttentionMetadata):
"""
GCUFlashAttnMetadata
"""
forward_mode: ForwardMode = ForwardMode.MIXED
_dtype: paddle.dtype = paddle.bfloat16
@@ -63,15 +65,18 @@ class GCUFlashAttnMetadata(AttentionMetadata):
pre_caches_length: int = 0
class GCUFlashAttnBackend(AttentionBackend):
"""
GCUFlashAttnBackend backend implementation.
"""
def __init__(self, fd_config: FDConfig, kv_num_heads: int, num_heads: int,
head_dim: int):
def __init__(
self,
fd_config: FDConfig,
kv_num_heads: int,
num_heads: int,
head_dim: int,
):
"""
GCUFlashAttnBackend __init__
"""
@@ -99,8 +104,6 @@ class GCUFlashAttnBackend(AttentionBackend):
self.rotary_embs = None
self.enable_monitor: bool = bool(os.getenv("FD_GCU_ATTN_MONITOR", False))
def init_attention_metadata(self, forward_meta: ForwardMeta):
"""Initialize attntion metadata hence all layers in the forward pass can reuse it."""
metadata = GCUFlashAttnMetadata()
@@ -131,15 +134,14 @@ class GCUFlashAttnBackend(AttentionBackend):
self.rotary_embs = metadata.rotary_embs.reshape((-1, self.head_dim))
# some info for attention
self.seq_lens_this_time_list = forward_meta.seq_lens_this_time.tolist() # List[int]
self.seq_lens_encoder_list = forward_meta.seq_lens_encoder.tolist() # List[List[int]]
self.seq_lens_decoder_list = forward_meta.seq_lens_decoder.tolist() # List[List[int]]
self.seq_lens_this_time_list = forward_meta.seq_lens_this_time.tolist() # List[int]
self.seq_lens_encoder_list = forward_meta.seq_lens_encoder.tolist() # List[List[int]]
self.seq_lens_decoder_list = forward_meta.seq_lens_decoder.tolist() # List[List[int]]
self.seq_lens_sum = np.sum(self.seq_lens_this_time_list)
self.max_seq_len_this_time = np.max(self.seq_lens_this_time_list)
num_seqs = forward_meta.seq_lens_this_time.shape[0]
self.is_decoder = all(x[0] == 0 for x in self.seq_lens_encoder_list)
self.is_all_prefill = all(x[0] == 0 for x in self.seq_lens_decoder_list)
@@ -147,8 +149,14 @@ class GCUFlashAttnBackend(AttentionBackend):
if self.all_slot_mapping is None:
max_num_blocks_per_seq = (self.max_seq_len + self.block_size - 1) // self.block_size
total_blocks = max_num_blocks_per_seq * self.max_num_seqs
self.all_block_tables = np.arange(0, total_blocks, dtype=np.int32).reshape((self.max_num_seqs, max_num_blocks_per_seq)).tolist()
self.all_slot_mapping = np.arange(0, total_blocks * self.block_size, dtype=np.int32).reshape((self.max_num_seqs, -1)).tolist()
self.all_block_tables = (
np.arange(0, total_blocks, dtype=np.int32)
.reshape((self.max_num_seqs, max_num_blocks_per_seq))
.tolist()
)
self.all_slot_mapping = (
np.arange(0, total_blocks * self.block_size, dtype=np.int32).reshape((self.max_num_seqs, -1)).tolist()
)
block_tables = []
slot_mapping = []
@@ -157,9 +165,9 @@ class GCUFlashAttnBackend(AttentionBackend):
position_ids = []
for seq_idx in range(num_seqs):
cache_len = None
if self.seq_lens_encoder_list[seq_idx][0] != 0: # prefill
if self.seq_lens_encoder_list[seq_idx][0] != 0: # prefill
cache_len = 0
elif self.seq_lens_decoder_list[seq_idx][0] != 0: # decode
elif self.seq_lens_decoder_list[seq_idx][0] != 0: # decode
cache_len = self.seq_lens_decoder_list[seq_idx][0]
# else: there is no request in this seq_idx
@@ -193,7 +201,6 @@ class GCUFlashAttnBackend(AttentionBackend):
self.max_seqlen_q = self.max_seq_len_this_time
self.max_seqlen_k = np.max(cache_lens)
def get_attntion_meta(self):
"""get_attntion_meta"""
return self.attention_metadata
@@ -206,9 +213,11 @@ class GCUFlashAttnBackend(AttentionBackend):
Calculate kv cache shape
"""
# [total_tokens, kv_num_heads, head_dim]
return (max_num_blocks * self.block_size,
self.kv_num_heads,
self.head_dim)
return (
max_num_blocks * self.block_size,
self.kv_num_heads,
self.head_dim,
)
@paddle.no_grad()
def forward_mixed(
@@ -232,7 +241,6 @@ class GCUFlashAttnBackend(AttentionBackend):
query = query.reshape_((1, -1, self.num_heads, self.head_dim))
key = key.reshape_((1, -1, self.kv_num_heads, self.head_dim))
# 1. Rope
if self.rotary_embs.dtype != query.dtype:
self.rotary_embs = paddle.cast(self.rotary_embs, query.dtype)
@@ -242,7 +250,7 @@ class GCUFlashAttnBackend(AttentionBackend):
key,
self.rotary_embs,
self.position_ids,
layer.use_neox_rotary_style
layer.use_neox_rotary_style,
)
# 2. Save kv cache
@@ -281,4 +289,3 @@ class GCUFlashAttnBackend(AttentionBackend):
)
res = res.reshape_((token_num, -1))
return res

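The block-table and slot-mapping precomputation reformatted above gives every sequence a fixed, contiguous range of KV-cache blocks, each covering block_size slots. The relationship is easier to see in a standalone NumPy sketch; the sizes below are made up.

import numpy as np

max_num_seqs, max_seq_len, block_size = 4, 64, 16
blocks_per_seq = (max_seq_len + block_size - 1) // block_size
total_blocks = max_num_seqs * blocks_per_seq

all_block_tables = np.arange(total_blocks, dtype=np.int32).reshape(max_num_seqs, blocks_per_seq)
all_slot_mapping = np.arange(total_blocks * block_size, dtype=np.int32).reshape(max_num_seqs, -1)

# Token t of sequence s lands in block all_block_tables[s, t // block_size]
# at offset t % block_size, which is exactly slot all_slot_mapping[s, t].
s, t = 2, 21
assert all_slot_mapping[s, t] == all_block_tables[s, t // block_size] * block_size + t % block_size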

@@ -16,33 +16,35 @@
from __future__ import annotations
import os
from dataclasses import dataclass, field
import math
from dataclasses import dataclass
from typing import TYPE_CHECKING, List, Optional
import paddle
import numpy as np
import math
import paddle
from paddleformers.utils.log import logger
from fastdeploy.config import FDConfig
from fastdeploy.model_executor.layers.attention.attention import Attention
from fastdeploy.model_executor.layers.attention.base_attention_backend import (
AttentionBackend, AttentionMetadata)
from fastdeploy.model_executor.ops.gcu import (fused_rotary_embedding,
mem_efficient_attention,
flash_attn_var_len)
from paddleformers.utils.log import logger
AttentionBackend,
AttentionMetadata,
)
from fastdeploy.model_executor.ops.gcu import (
fused_rotary_embedding,
mem_efficient_attention,
)
if TYPE_CHECKING:
from fastdeploy.model_executor.forward_meta import ForwardMeta, ForwardMode
@dataclass
class GCUMemEfficientAttnMetadata(AttentionMetadata):
"""
GCUMemEfficientAttnMetadata
"""
forward_mode: ForwardMode = ForwardMode.MIXED
_dtype: paddle.dtype = paddle.bfloat16
@@ -63,15 +65,18 @@ class GCUMemEfficientAttnMetadata(AttentionMetadata):
pre_caches_length: int = 0
class GCUMemEfficientAttnBackend(AttentionBackend):
"""
GCUMemEfficientAttnBackend backend implementation.
"""
def __init__(self, fd_config: FDConfig, kv_num_heads: int, num_heads: int,
head_dim: int):
def __init__(
self,
fd_config: FDConfig,
kv_num_heads: int,
num_heads: int,
head_dim: int,
):
"""
GCUMemEfficientAttnBackend __init__
"""
@@ -99,8 +104,6 @@ class GCUMemEfficientAttnBackend(AttentionBackend):
self.rotary_embs = None
self.use_paddle_native_sdpa = False
def init_attention_metadata(self, forward_meta: ForwardMeta):
"""Initialize attntion metadata hence all layers in the forward pass can reuse it."""
metadata = GCUMemEfficientAttnMetadata()
@@ -125,32 +128,35 @@ class GCUMemEfficientAttnBackend(AttentionBackend):
metadata.pre_caches_length = forward_meta.pre_caches_length # not inited
self.attention_metadata = metadata
if self.rotary_embs is None:
self.rotary_embs = metadata.rotary_embs.reshape((-1, self.head_dim))
# some info for attention
self.seq_lens_this_time_list = forward_meta.seq_lens_this_time.tolist() # List[int]
self.seq_lens_encoder_list = forward_meta.seq_lens_encoder.tolist() # List[List[int]]
self.seq_lens_decoder_list = forward_meta.seq_lens_decoder.tolist() # List[List[int]]
self.seq_lens_this_time_list = forward_meta.seq_lens_this_time.tolist() # List[int]
self.seq_lens_encoder_list = forward_meta.seq_lens_encoder.tolist() # List[List[int]]
self.seq_lens_decoder_list = forward_meta.seq_lens_decoder.tolist() # List[List[int]]
self.seq_lens_sum = np.sum(self.seq_lens_this_time_list)
self.max_seq_len_this_time = np.max(self.seq_lens_this_time_list)
num_seqs = forward_meta.seq_lens_this_time.shape[0]
self.is_decoder = all(x[0] == 0 for x in self.seq_lens_encoder_list)
self.is_all_prefill = all(x[0] == 0 for x in self.seq_lens_decoder_list)
# block_tables and slot_mapping
if self.all_slot_mapping is None:
max_num_blocks_per_seq = (self.max_seq_len + self.block_size - 1) // self.block_size
total_blocks = max_num_blocks_per_seq * self.max_num_seqs
self.all_block_tables = np.arange(0, total_blocks, dtype=np.int32).reshape((self.max_num_seqs, max_num_blocks_per_seq)).tolist()
self.all_slot_mapping = np.arange(0, total_blocks * self.block_size, dtype=np.int32).reshape((self.max_num_seqs, -1)).tolist()
self.all_block_tables = (
np.arange(0, total_blocks, dtype=np.int32)
.reshape((self.max_num_seqs, max_num_blocks_per_seq))
.tolist()
)
self.all_slot_mapping = (
np.arange(0, total_blocks * self.block_size, dtype=np.int32).reshape((self.max_num_seqs, -1)).tolist()
)
block_tables = []
slot_mapping = []
@@ -162,9 +168,9 @@ class GCUMemEfficientAttnBackend(AttentionBackend):
position_ids = []
for seq_idx in range(num_seqs):
cache_len = None
if self.seq_lens_encoder_list[seq_idx][0] != 0: # prefill
if self.seq_lens_encoder_list[seq_idx][0] != 0: # prefill
cache_len = 0
elif self.seq_lens_decoder_list[seq_idx][0] != 0: # decode
elif self.seq_lens_decoder_list[seq_idx][0] != 0: # decode
cache_len = self.seq_lens_decoder_list[seq_idx][0]
# else: there is no request in this seq_idx
@@ -179,9 +185,12 @@ class GCUMemEfficientAttnBackend(AttentionBackend):
position_ids.extend(self.position_ids_base[start:end])
query_lens.append(lens_this_time)
cached_kv_lens.append(end)
cached_kv_slot_range.append([self.all_slot_mapping[seq_idx][0], self.all_slot_mapping[seq_idx][end]])
cached_kv_slot_range.append(
[
self.all_slot_mapping[seq_idx][0],
self.all_slot_mapping[seq_idx][end],
]
)
self.block_tables = paddle.to_tensor(block_tables, dtype="int32")
self.slot_mapping = paddle.to_tensor(slot_mapping, dtype="int32")
@@ -206,7 +215,6 @@ class GCUMemEfficientAttnBackend(AttentionBackend):
self.cached_kv_lens = cached_kv_lens
self.cached_kv_slot_range = cached_kv_slot_range
def get_attntion_meta(self):
"""get_attntion_meta"""
return self.attention_metadata
@@ -219,9 +227,11 @@ class GCUMemEfficientAttnBackend(AttentionBackend):
Calculate kv cache shape
"""
# [total_tokens, kv_num_heads, head_dim]
return (max_num_blocks * self.block_size,
self.kv_num_heads,
self.head_dim)
return (
max_num_blocks * self.block_size,
self.kv_num_heads,
self.head_dim,
)
@paddle.no_grad()
def forward_mixed(
@@ -245,7 +255,6 @@ class GCUMemEfficientAttnBackend(AttentionBackend):
query = query.reshape_((1, -1, self.num_heads, self.head_dim))
key = key.reshape_((1, -1, self.kv_num_heads, self.head_dim))
# 1. Rope
if self.rotary_embs.dtype != query.dtype:
self.rotary_embs = paddle.cast(self.rotary_embs, query.dtype)
@@ -255,7 +264,7 @@ class GCUMemEfficientAttnBackend(AttentionBackend):
key,
self.rotary_embs,
self.position_ids,
layer.use_neox_rotary_style
layer.use_neox_rotary_style,
)
# 2. Save kv cache
@@ -282,9 +291,7 @@ class GCUMemEfficientAttnBackend(AttentionBackend):
v_ = value_caches[kv_start:kv_end, :, :]
if self.use_paddle_native_sdpa:
res = self.native_sdpa_impl(
q_, k_, v_
)
res = self.native_sdpa_impl(q_, k_, v_)
else:
res = mem_efficient_attention(
query=q_.unsqueeze(0),
@@ -302,7 +309,6 @@ class GCUMemEfficientAttnBackend(AttentionBackend):
result = result.reshape_((token_num, -1))
return result
def get_triangle_upper_mask(self, shape, dtype):
# [batch_size, 1, q_seq_len, kv_seq_len]
shape[1] = 1
@@ -313,7 +319,6 @@ class GCUMemEfficientAttnBackend(AttentionBackend):
mask = paddle.triu(mask, diagonal=kv_seq_len - q_seq_len + 1)
return mask
def native_sdpa_impl(self, query, key, value):
# input shape: [num_tokens, num_heads, head_dim] -> [1, num_tokens, num_heads, head_dim]
q = query.unsqueeze(0)
@@ -342,13 +347,9 @@ class GCUMemEfficientAttnBackend(AttentionBackend):
# matmul and devide by sqrt(head_dim)
attn_weights = paddle.matmul(q / math.sqrt(head_dim), k.transpose([0, 1, 3, 2]))
attention_mask = self.get_triangle_upper_mask(
[batch, 1, q_seq_len, kv_seq_len], q.dtype
)
attention_mask = self.get_triangle_upper_mask([batch, 1, q_seq_len, kv_seq_len], q.dtype)
attn_weights = attn_weights + attention_mask
attn_weights = paddle.nn.functional.softmax(
attn_weights, axis=-1, dtype="float32"
).astype(q.dtype)
attn_weights = paddle.nn.functional.softmax(attn_weights, axis=-1, dtype="float32").astype(q.dtype)
attn_output = paddle.matmul(attn_weights, v)
attn_output = attn_output.transpose([0, 2, 1, 3])

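The native SDPA fallback condensed above is textbook masked attention: scale Q by 1/sqrt(head_dim), add an upper-triangular mask offset by (kv_len - q_len), softmax, then weight V. A single-head NumPy approximation follows; the -1e4 fill value stands in for the dtype minimum used by get_triangle_upper_mask, and the shapes are illustrative.

import math
import numpy as np

def causal_upper_mask(q_len: int, kv_len: int, neg: float = -1e4) -> np.ndarray:
    # Blocks keys more than (kv_len - q_len) positions ahead of each query.
    return np.triu(np.full((q_len, kv_len), neg, dtype=np.float32), k=kv_len - q_len + 1)

def naive_sdpa(q: np.ndarray, k: np.ndarray, v: np.ndarray) -> np.ndarray:
    # q: [q_len, head_dim]; k, v: [kv_len, head_dim]; one head for brevity.
    scores = (q / math.sqrt(q.shape[-1])) @ k.T + causal_upper_mask(q.shape[0], k.shape[0])
    weights = np.exp(scores - scores.max(axis=-1, keepdims=True))
    weights /= weights.sum(axis=-1, keepdims=True)
    return weights @ v

q = np.random.randn(4, 8).astype(np.float32)
kv = np.random.randn(6, 8).astype(np.float32)
print(naive_sdpa(q, kv, kv).shape)  # (4, 8)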

@@ -11,6 +11,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""""
""" "
gcu moe
"""


@@ -1,4 +1,3 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
@@ -15,7 +14,6 @@
# limitations under the License.
"""
import multiprocessing
import os
@@ -24,27 +22,30 @@ import paddle
from paddle import nn
from paddleformers.utils.log import logger
from fastdeploy.model_executor.layers.moe.fused_moe_backend_base import \
MoEMethodBase
from fastdeploy.model_executor.layers.utils import (CpuGuard,
create_and_set_parameter,
get_tensor)
from fastdeploy.model_executor.ops.gcu import (invoke_fused_moe_kernel,
moe_align_block_size,
topk_softmax,
weight_quantize_custom_rtn,
weight_quantize_rtn)
from fastdeploy.model_executor.layers.moe.fused_moe_backend_base import MoEMethodBase
from fastdeploy.model_executor.layers.utils import (
CpuGuard,
create_and_set_parameter,
get_tensor,
)
from fastdeploy.model_executor.ops.gcu import (
invoke_fused_moe_kernel,
moe_align_block_size,
topk_softmax,
weight_quantize_custom_rtn,
weight_quantize_rtn,
)
class GCUFusedMoeMethod(MoEMethodBase):
"""
Use GCU to compute Fused MoE.
"""
def __init__(self, quant_config):
super().__init__(quant_config)
self.group_size = -1
def create_weights(self, layer: nn.Layer, state_dict):
"""
Paddle gcu create weight process.
@@ -53,28 +54,28 @@ class GCUFusedMoeMethod(MoEMethodBase):
up_gate_proj_weights, down_proj_weights = layer.extract_moe_ffn_weights(state_dict)
stacked_up_gate_proj_weights = paddle.stack(up_gate_proj_weights, axis=0)
stacked_down_proj_weights = paddle.stack(down_proj_weights, axis=0)
for idx, weight_tensor in enumerate(
[stacked_up_gate_proj_weights, stacked_down_proj_weights]):
for idx, weight_tensor in enumerate([stacked_up_gate_proj_weights, stacked_down_proj_weights]):
# shape [E, K, N] -> [E, N, K]
weight_tensor = paddle.transpose(weight_tensor, [0, 2, 1])
weight_name = self.added_weight_attrs[idx]
setattr(
layer, weight_name,
layer,
weight_name,
layer.create_parameter(
shape=weight_tensor.shape,
dtype=weight_tensor.dtype,
default_initializer=paddle.nn.initializer.Constant(0),
))
),
)
getattr(layer, weight_name).set_value(weight_tensor)
@paddle.no_grad()
def compute_ffn(
self,
layer: nn.Layer,
x: paddle.Tensor,
gate_out: paddle.Tensor,
enable_quant = False
enable_quant=False,
) -> paddle.Tensor:
"""
Paddle gcu compute Fused MoE.
@@ -86,8 +87,17 @@ class GCUFusedMoeMethod(MoEMethodBase):
topk_weights = paddle.empty([token_num, top_k], dtype=gate_out.dtype)
topk_indices = paddle.empty([token_num, top_k], dtype="int32")
token_expert_indices = paddle.empty([token_num, top_k], dtype="int32",)
topk_softmax(topk_weights, topk_indices, token_expert_indices, gate_out, norm_topk_prob=True)
token_expert_indices = paddle.empty(
[token_num, top_k],
dtype="int32",
)
topk_softmax(
topk_weights,
topk_indices,
token_expert_indices,
gate_out,
norm_topk_prob=True,
)
config = {
"BLOCK_SIZE_M": 32,
@@ -136,7 +146,7 @@ class GCUFusedMoeMethod(MoEMethodBase):
top_k,
config,
enable_quant, # use_int4_w4a16
[0, self.group_size], # block_shape
[0, self.group_size], # block_shape
)
intermediate_cache2 = paddle.empty(
@@ -144,8 +154,7 @@ class GCUFusedMoeMethod(MoEMethodBase):
dtype=x.dtype,
)
intermediate_cache2 = paddle.incubate.nn.functional.swiglu(
intermediate_cache1)
intermediate_cache2 = paddle.incubate.nn.functional.swiglu(intermediate_cache1)
intermediate_cache2 = intermediate_cache2.reshape([-1, moe_intermediate_size])
@@ -181,13 +190,14 @@ class GCUFusedMoeMethod(MoEMethodBase):
fused_moe_out = fused_moe_out.reshape_([token_num, hidden_size])
if layer.tp_size > 1:
from fastdeploy.distributed.communication_op import \
tensor_model_parallel_all_reduce
from fastdeploy.distributed.communication_op import (
tensor_model_parallel_all_reduce,
)
tensor_model_parallel_all_reduce(fused_moe_out)
return fused_moe_out
def apply(
self,
layer: nn.Layer,
@@ -199,7 +209,6 @@ class GCUFusedMoeMethod(MoEMethodBase):
"""
return self.compute_ffn(layer, x, gate_out, enable_quant=False)
def apply_ep_prefill(
self,
layer: nn.Layer,
@@ -211,7 +220,6 @@ class GCUFusedMoeMethod(MoEMethodBase):
"""
raise NotImplementedError
def apply_ep_decode(
self,
layer: nn.Layer,
@@ -223,7 +231,6 @@ class GCUFusedMoeMethod(MoEMethodBase):
"""
raise NotImplementedError
def apply_tp(
self,
layer: nn.Layer,
@@ -247,48 +254,44 @@ class GCUWeightOnlyMoEMethod(GCUFusedMoeMethod):
self.moe_quant_type = self.quant_config.algo
self.pack_num = 1
assert self.quant_config.algo == "weight_only_int4", \
"GCUWeightOnlyMoEMethod only support weight_only_int4, but got:{self.quant_config.algo}"
assert (
self.quant_config.algo == "weight_only_int4"
), "GCUWeightOnlyMoEMethod only support weight_only_int4, but got:{self.quant_config.algo}"
self.added_qzeros_attrs = [
"up_gate_proj_weight_zeros", "down_proj_weight_zeros"
"up_gate_proj_weight_zeros",
"down_proj_weight_zeros",
]
self.group_size = 64
self.quant_multi_process_group_size = int(
os.getenv("FD_MOE_QUANT_MULTI_PROCESS_GROUP_SIZE", 8)
)
self.quant_multi_process_group_size = int(os.getenv("FD_MOE_QUANT_MULTI_PROCESS_GROUP_SIZE", 8))
logger.info(f"GCUWeightOnlyMoEMethod quant_multi_process_group_size: {self.quant_multi_process_group_size}")
def process_prequanted_weights(self, layer: nn.Layer, state_dict):
"""
Paddle gcu process prequanted weights.
"""
up_gate_proj_expert_weight_key = layer.weight_key_map.get(
"up_gate_proj_expert_weight_key", None)
down_proj_expert_weight_key = layer.weight_key_map.get(
"down_proj_expert_weight_key", None)
up_gate_proj_expert_weight_scale_key = layer.weight_key_map.get(
"up_gate_proj_expert_weight_scale_key", None)
down_proj_expert_weight_scale_key = layer.weight_key_map.get(
"down_proj_expert_weight_scale_key", None)
up_gate_proj_expert_weight_key = layer.weight_key_map.get("up_gate_proj_expert_weight_key", None)
down_proj_expert_weight_key = layer.weight_key_map.get("down_proj_expert_weight_key", None)
up_gate_proj_expert_weight_scale_key = layer.weight_key_map.get("up_gate_proj_expert_weight_scale_key", None)
down_proj_expert_weight_scale_key = layer.weight_key_map.get("down_proj_expert_weight_scale_key", None)
up_gate_proj_weights, down_proj_weights = layer.load_experts_weight(
state_dict, up_gate_proj_expert_weight_key, down_proj_expert_weight_key)
state_dict,
up_gate_proj_expert_weight_key,
down_proj_expert_weight_key,
)
# self.check(layer, up_gate_proj_weights, down_proj_weights)
up_gate_proj_weight_scale = []
down_proj_weight_scale = []
for i in range(layer.num_experts):
expert_idx = layer.expert_id_offset + i
up_gate_proj_weight_scale.append(
get_tensor(
state_dict.pop(
up_gate_proj_expert_weight_scale_key.format(expert_idx))))
get_tensor(state_dict.pop(up_gate_proj_expert_weight_scale_key.format(expert_idx)))
)
down_proj_weight_scale.append(
get_tensor(
state_dict.pop(
down_proj_expert_weight_scale_key.format(expert_idx))))
get_tensor(state_dict.pop(down_proj_expert_weight_scale_key.format(expert_idx)))
)
up_gate_proj_weight = paddle.stack(up_gate_proj_weights, axis=0)
down_proj_weight = paddle.stack(down_proj_weights, axis=0)
@@ -299,12 +302,11 @@ class GCUWeightOnlyMoEMethod(GCUFusedMoeMethod):
"up_gate_proj_weight": up_gate_proj_weight,
"down_proj_weight": down_proj_weight,
"up_gate_proj_weight_scale": up_gate_proj_weight_scale,
"down_proj_weight_scale": down_proj_weight_scale
"down_proj_weight_scale": down_proj_weight_scale,
}
for name, tensor in name_tensor_map.items():
create_and_set_parameter(layer, name, tensor)
@paddle.no_grad()
def create_weights(self, layer: nn.Layer, state_dict):
"""
@@ -313,7 +315,6 @@ class GCUWeightOnlyMoEMethod(GCUFusedMoeMethod):
up_gate_proj_weights, down_proj_weights = layer.extract_moe_ffn_weights(state_dict)
self.check(layer, up_gate_proj_weights, down_proj_weights)
def quant_worker(p_group_idx, shared_dict, weights, moe_quant_type, group_size):
with CpuGuard():
p_group_size = len(weights)
@@ -322,13 +323,13 @@ class GCUWeightOnlyMoEMethod(GCUFusedMoeMethod):
quant_weight, scale = weight_quantize_custom_rtn(
weights[group_j],
moe_quant_type,
group_size # group_size
group_size, # group_size
)
shared_dict[p_group_size * p_group_idx + group_j] = (
quant_weight, scale
quant_weight,
scale,
)
for idx, weight_tensor in enumerate([up_gate_proj_weights, down_proj_weights]):
weight_name = self.added_weight_attrs[idx]
scale_name = self.added_scale_attrs[idx]
@@ -354,7 +355,13 @@ class GCUWeightOnlyMoEMethod(GCUFusedMoeMethod):
p = multiprocessing.Process(
target=quant_worker,
args=(i, shared_dict, w, self.moe_quant_type, self.group_size)
args=(
i,
shared_dict,
w,
self.moe_quant_type,
self.group_size,
),
)
p.start()
processes.append(p)
@@ -376,7 +383,7 @@ class GCUWeightOnlyMoEMethod(GCUFusedMoeMethod):
quant_weight, scale = weight_quantize_rtn(
weight_tensor[i],
self.moe_quant_type,
self.group_size # group_size
self.group_size, # group_size
)
weight_list.append(quant_weight)
weight_scale_list.append(scale)
@@ -389,7 +396,6 @@ class GCUWeightOnlyMoEMethod(GCUFusedMoeMethod):
quanted_weight_zeros = quanted_weight_scale * 8
create_and_set_parameter(layer, zeros_name, quanted_weight_zeros)
def apply(
self,
layer: nn.Layer,

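The routing that compute_ffn delegates to topk_softmax(..., norm_topk_prob=True) can be approximated outside the custom op: softmax over experts, keep the top-k probabilities per token, then renormalize the kept weights to sum to one. A NumPy sketch follows; the real op's tie-breaking and output ordering may differ.

import numpy as np

def topk_softmax_route(gate_logits: np.ndarray, top_k: int):
    # Softmax over the expert axis, numerically stabilized.
    probs = np.exp(gate_logits - gate_logits.max(axis=-1, keepdims=True))
    probs /= probs.sum(axis=-1, keepdims=True)
    topk_ids = np.argsort(-probs, axis=-1)[:, :top_k].astype(np.int32)
    topk_weights = np.take_along_axis(probs, topk_ids, axis=-1)
    topk_weights /= topk_weights.sum(axis=-1, keepdims=True)  # norm_topk_prob=True
    return topk_weights, topk_ids

logits = np.random.randn(5, 8).astype(np.float32)  # 5 tokens, 8 experts
weights, ids = topk_softmax_route(logits, top_k=2)
print(weights.sum(axis=-1))  # each row sums to 1.0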

@@ -11,7 +11,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""""
""" "
gcu quantization
"""
from .weight_only import GCUWeightOnlyLinearMethod


@@ -17,7 +17,9 @@
import paddle
from fastdeploy.model_executor.layers.quantization.weight_only import (
WeightOnlyConfig, WeightOnlyLinearMethod)
WeightOnlyConfig,
WeightOnlyLinearMethod,
)
from fastdeploy.model_executor.layers.utils import get_tensor
from fastdeploy.model_executor.ops.gcu import linear_quant, weight_quantize_rtn
@@ -35,7 +37,6 @@ class GCUWeightOnlyLinearMethod(WeightOnlyLinearMethod):
self.quant_config = quant_config
self.group_size = -1
def create_weights(self, layer):
# The scale shape should be equal to the output dim of weight using Per-Channel Quantization.
weight_scale_shape = [layer.weight_shape[1]]
@@ -50,7 +51,6 @@ class GCUWeightOnlyLinearMethod(WeightOnlyLinearMethod):
is_bias=False,
)
def process_prequanted_weights(self, layer, state_dict) -> None:
"""
Process pre-quantized weights before applying them to the model
@@ -62,9 +62,7 @@ class GCUWeightOnlyLinearMethod(WeightOnlyLinearMethod):
quant_weight = get_tensor(state_dict.pop(layer.weight_key))
weight_scale = get_tensor(state_dict.pop(layer.weight_scale_key))
layer.weight.set_value(quant_weight)
layer.weight_scale.set_value(
weight_scale.astype(paddle.get_default_dtype()))
layer.weight_scale.set_value(weight_scale.astype(paddle.get_default_dtype()))
def process_loaded_weights(self, layer, weight) -> None:
quanted_weight_tensor, weight_scale_tensor = weight_quantize_rtn(
@@ -74,9 +72,7 @@ class GCUWeightOnlyLinearMethod(WeightOnlyLinearMethod):
)
layer.weight.set_value(quanted_weight_tensor)
layer.weight_scale.set_value(
weight_scale_tensor.astype(paddle.get_default_dtype()))
layer.weight_scale.set_value(weight_scale_tensor.astype(paddle.get_default_dtype()))
@paddle.no_grad()
def apply(self, layer, x):


@@ -14,4 +14,4 @@
"""
npu backend methods
"""
"""


@@ -18,4 +18,4 @@ xpu backend methods
from .quantization.weight_only import XPUWeightOnlyLinearMethod
__all__ = ['XPUWeightOnlyLinearMethod']
__all__ = ["XPUWeightOnlyLinearMethod"]


@@ -13,11 +13,14 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""
import paddle
from paddle import nn
from fastdeploy.model_executor.layers.quantization.weight_only import (
WeightOnlyConfig, WeightOnlyLinearMethod)
WeightOnlyConfig,
WeightOnlyLinearMethod,
)
from fastdeploy.model_executor.ops.xpu import weight_quantize_xpu
@@ -48,13 +51,10 @@ class XPUWeightOnlyLinearMethod(WeightOnlyLinearMethod):
is_bias=False,
)
def process_loaded_weights(self, layer: nn.Layer,
weight: paddle.Tensor) -> None:
def process_loaded_weights(self, layer: nn.Layer, weight: paddle.Tensor) -> None:
"""
loaded_weights using xpu special quantization
"""
quanted_weight_tensor, weight_scale_tensor = weight_quantize_xpu(
weight, self.quant_config.algo, -1, -1)
layer.weight.set_value(
paddle.transpose(quanted_weight_tensor, [1, 0]))
quanted_weight_tensor, weight_scale_tensor = weight_quantize_xpu(weight, self.quant_config.algo, -1, -1)
layer.weight.set_value(paddle.transpose(quanted_weight_tensor, [1, 0]))
layer.weight_scale.set_value(weight_scale_tensor)


@@ -36,7 +36,8 @@ def xpu_clip_and_round(x: np.ndarray) -> np.ndarray:
def xpu_quant_qkv_weight(
weight_np: np.ndarray) -> Tuple[paddle.Tensor, paddle.Tensor]:
weight_np: np.ndarray,
) -> Tuple[paddle.Tensor, paddle.Tensor]:
"""
Quantize the query, key, and value weights for the Transformer model.
@@ -65,7 +66,8 @@ def xpu_quant_qkv_weight(
def xpu_quant_weight(
weight_np: np.ndarray) -> Tuple[paddle.Tensor, paddle.Tensor]:
weight_np: np.ndarray,
) -> Tuple[paddle.Tensor, paddle.Tensor]:
"""
Quantize the weight tensor for XPU devices.