Mirror of https://github.com/PaddlePaddle/FastDeploy.git (synced 2025-09-27 04:46:16 +08:00)
[v1 loader]qwen Offline fp8 (#4036)
* support offline fp8
* update ut
* update ut
* update ut
* fix
* update
* update
@@ -57,7 +57,7 @@ class UnquantizedLinearMethod(QuantMethodBase):
             {
                 **extra_weight_attrs,
                 "weight_loader": extra_weight_attrs.get("weight_loader", default_weight_loader(layer.fd_config)),
-                "model_format": extra_weight_attrs.get("model_format", ""),
+                "weight_need_transpose": extra_weight_attrs.get("model_format") == "torch",
             },
         )

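A brief illustration (not part of this commit): the boolean `weight_need_transpose` attribute encodes the layout difference between checkpoint formats up front, so loaders no longer compare the `model_format` string. A minimal sketch, assuming a torch-format linear weight is stored as [out_features, in_features] while the Paddle parameter expects [in_features, out_features]:

    # Sketch only; illustrative shapes, not FastDeploy API.
    import paddle

    param = paddle.zeros([1024, 4096])       # Paddle layout: [in_features, out_features]
    loaded = paddle.randn([4096, 1024])      # torch layout: [out_features, in_features]
    weight_need_transpose = True             # precomputed from model_format == "torch"
    if weight_need_transpose:
        loaded = loaded.transpose([1, 0])    # now [1024, 4096], matches param
    assert loaded.shape == param.shape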
@@ -341,10 +341,10 @@ class MergedReplicatedLinear(ReplicatedLinear):
         self.output_sizes = output_sizes

     def weight_loader(self, param, loaded_weight, loaded_shard_id: Optional[str] = None):
-        model_format = getattr(param, "model_format", "")
+        weight_need_transpose = getattr(param, "weight_need_transpose", False)
         loaded_weight = get_tensor(loaded_weight)

-        if model_format == "torch":
+        if weight_need_transpose:
             loaded_weight = loaded_weight.transpose([1, 0])

         assert loaded_shard_id in ["q_a", "kv_a"]
@@ -365,6 +365,12 @@ class MergedReplicatedLinear(ReplicatedLinear):
         assert param.shape == loaded_weight.shape, (
             f" Attempted to load weight ({loaded_weight.shape}) " f"into parameter ({param.shape})"
         )
+        # Ensure loaded weight dtype matches model param dtype
+        if loaded_weight.dtype != param.dtype:
+            if loaded_weight.dtype == paddle.int8 and param.dtype == paddle.float8_e4m3fn:
+                loaded_weight = loaded_weight.view(param.dtype)
+            else:
+                loaded_weight = loaded_weight.cast(param.dtype)
         param.copy_(loaded_weight, False)

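A brief illustration (not part of this commit) of the dtype handling added above: offline FP8 checkpoints typically store the quantized weights as raw int8 bytes, so matching the parameter dtype means reinterpreting those bytes rather than converting values. A minimal sketch, assuming Paddle's float8_e4m3fn dtype is available:

    # Sketch only: bit-level reinterpretation vs. numeric conversion.
    import paddle

    def match_dtype(loaded_weight, param_dtype):
        if loaded_weight.dtype == param_dtype:
            return loaded_weight
        if loaded_weight.dtype == paddle.int8 and param_dtype == paddle.float8_e4m3fn:
            # Reinterpret the stored bytes as FP8 values; no numeric change.
            return loaded_weight.view(param_dtype)
        # Any other mismatch is a genuine value conversion.
        return loaded_weight.cast(param_dtype)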
@@ -483,15 +489,16 @@ class MergedColumnParallelLinear(ColumnParallelLinear):
         )

     def weight_loader(self, param, loaded_weight, loaded_shard_id: Optional[str] = None):
-        model_format = getattr(param, "model_format", "")
-        if model_format == "torch":
-            loaded_weight = get_tensor(loaded_weight)
-            loaded_weight = loaded_weight.transpose([1, 0])
+        weight_need_transpose = getattr(param, "weight_need_transpose", False)
         output_dim = getattr(param, "output_dim", None)
         assert output_dim is not None
         shard_dim = -1 if output_dim else 0
         output_size = param.shape[shard_dim]
         if loaded_shard_id is None:
+            if weight_need_transpose:
+                loaded_weight = get_tensor(loaded_weight)
+                loaded_weight = loaded_weight.transpose([1, 0])
+                param.weight_need_transpose = False
             # Loaded weight is already fused on disk.
             shard_offsets = [
                 # (shard_id, shard_offset, shard_size)
@@ -506,6 +513,9 @@ class MergedColumnParallelLinear(ColumnParallelLinear):
         else:
             # split gate up
             assert loaded_shard_id in ["gate", "up"]
+            if weight_need_transpose:
+                loaded_weight = get_tensor(loaded_weight)
+                loaded_weight = loaded_weight.transpose([1, 0])
             # Tensor parallelism splits the weight along the output_dim
             if self.nranks != 1:
                 dim = -1 if output_dim else 0
@@ -532,6 +542,12 @@ class MergedColumnParallelLinear(ColumnParallelLinear):
         assert param.shape == loaded_weight.shape, (
             f" Attempted to load weight ({loaded_weight.shape}) " f"into parameter ({param.shape})"
         )
+        # Ensure loaded weight dtype matches model param dtype
+        if loaded_weight.dtype != param.dtype:
+            if loaded_weight.dtype == paddle.int8 and param.dtype == paddle.float8_e4m3fn:
+                loaded_weight = loaded_weight.view(param.dtype)
+            else:
+                loaded_weight = loaded_weight.cast(param.dtype)
         param.copy_(loaded_weight, False)

     def load_state_dict(self, state_dict: dict):
@@ -604,11 +620,11 @@ class QKVParallelLinear(ColumnParallelLinear):
             add_bias=add_bias,
         )

-    def _get_shard_size_mapping(self, loaded_shard_id: str):
+    def _get_shard_size_mapping(self, loaded_shard_id: str, head_dim: int):
         shard_size_mapping = {
-            "q": self.num_heads_per_rank * self.head_dim,
-            "k": self.kv_num_heads_per_rank * self.head_dim,
-            "v": self.kv_num_heads_per_rank * self.head_dim,
+            "q": self.num_heads_per_rank * head_dim,
+            "k": self.kv_num_heads_per_rank * head_dim,
+            "v": self.kv_num_heads_per_rank * head_dim,
         }
         return shard_size_mapping.get(loaded_shard_id)

@@ -617,11 +633,12 @@ class QKVParallelLinear(ColumnParallelLinear):
         assert output_dim is not None
         dim = -1 if output_dim else 0
         head_dim = param.shape[dim] // (self.num_heads_per_rank + 2 * self.kv_num_heads_per_rank)
-        model_format = getattr(param, "model_format", "")
-        if model_format == "torch":
-            loaded_weight = get_tensor(loaded_weight)
-            loaded_weight = loaded_weight.transpose([1, 0])
-        if loaded_shard_id is None:
+        weight_need_transpose = getattr(param, "weight_need_transpose", False)
+        if loaded_shard_id is None:
+            if weight_need_transpose:
+                loaded_weight = get_tensor(loaded_weight)
+                loaded_weight = loaded_weight.transpose([1, 0])
+                param.weight_need_transpose = False
             # Loaded weight is already fused on disk
             shard_offsets = [
                 # (shard_id, shard_offset, shard_size)
@@ -637,13 +654,16 @@ class QKVParallelLinear(ColumnParallelLinear):
         else:
             # split q k v
             assert loaded_shard_id in ["q", "k", "v"]
+            if weight_need_transpose:
+                loaded_weight = get_tensor(loaded_weight)
+                loaded_weight = loaded_weight.transpose([1, 0])
             # Tensor parallelism splits the weight along the output_dim
             if self.nranks != 1:
-                block_size = self._get_shard_size_mapping(loaded_shard_id)
+                block_size = self._get_shard_size_mapping(loaded_shard_id, head_dim)
                 shard_id = self.local_rank if loaded_shard_id == "q" else self.local_rank // self.num_kv_head_replicas
                 shard_offset = shard_id * block_size
-                shard_size = (shard_id + 1) * block_size
-                loaded_weight = slice_fn(loaded_weight, output_dim, start=shard_offset, end=shard_size)
+                shard_size = block_size
+                loaded_weight = slice_fn(loaded_weight, output_dim, start=shard_offset, end=shard_offset + shard_size)

             loaded_weight = get_tensor(loaded_weight)

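A brief illustration (not part of this commit): the slice above now passes an explicit offset plus size instead of reusing a running end index. With illustrative numbers the result is unchanged, but the naming is clearer:

    # Sketch only, illustrative numbers: two ranks, 512 rows per shard.
    block_size = 512
    for shard_id in (0, 1):
        shard_offset = shard_id * block_size      # 0, then 512
        shard_size = block_size                   # was (shard_id + 1) * block_size
        start, end = shard_offset, shard_offset + shard_size
        # rank 0 slices [0, 512), rank 1 slices [512, 1024); the old end value
        # (shard_id + 1) * block_size is numerically the same, just mislabeled as a size.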
@@ -663,10 +683,17 @@ class QKVParallelLinear(ColumnParallelLinear):
             param_shard_size = self.kv_num_heads_per_rank * head_dim
             if hasattr(param, "tensor_track"):
                 param.tensor_track.mark(start=param_shard_offset, end=param_shard_offset + param_shard_size)

             param = slice_fn(param, output_dim, start=param_shard_offset, end=param_shard_offset + param_shard_size)
         assert param.shape == loaded_weight.shape, (
             f" Attempted to load weight ({loaded_weight.shape}) " f"into parameter ({param.shape})"
         )
+        # Ensure loaded weight dtype matches model param dtype
+        if loaded_weight.dtype != param.dtype:
+            if loaded_weight.dtype == paddle.int8 and param.dtype == paddle.float8_e4m3fn:
+                loaded_weight = loaded_weight.view(param.dtype)
+            else:
+                loaded_weight = loaded_weight.cast(param.dtype)
         param.copy_(loaded_weight, False)

     def load_weight(self, state_dict: dict):
@@ -91,7 +91,7 @@ class ParallelLMHead(nn.Layer):
                 self.linear.weight,
                 {
                     "weight_loader": default_weight_loader(self.fd_config),
-                    "model_format": self.fd_config.model_config.model_format,
+                    "weight_need_transpose": self.fd_config.model_config.model_format == "torch",
                 },
             )
             if self.nranks > 1:
@@ -110,7 +110,7 @@ class ParallelLMHead(nn.Layer):
                 self.linear.weight,
                 {
                     "weight_loader": default_weight_loader(self.fd_config),
-                    "model_format": self.fd_config.model_config.model_format,
+                    "weight_need_transpose": self.fd_config.model_config.model_format == "torch",
                 },
             )

@@ -216,18 +216,17 @@ class UnquantizedFusedMoEMethod(MoEMethodBase):
             dtype=layer.weight_dtype,
             default_initializer=paddle.nn.initializer.Constant(0),
         )

         set_weight_attrs(
             layer.up_gate_proj_weight,
             {
                 "weight_loader": extra_weight_attrs.get("weight_loader", default_weight_loader(layer.fd_config)),
-                "model_format": extra_weight_attrs.get("model_format", ""),
+                "weight_need_transpose": extra_weight_attrs.get("model_format") == "torch",
             },
         )
         set_weight_attrs(
             layer.down_proj_weight,
             {
                 "weight_loader": extra_weight_attrs.get("weight_loader", default_weight_loader(layer.fd_config)),
-                "model_format": extra_weight_attrs.get("model_format", ""),
+                "weight_need_transpose": extra_weight_attrs.get("model_format") == "torch",
             },
         )

@@ -1024,8 +1024,8 @@ class CutlassWeightOnlyMoEMethod(CutlassMoEMethod):
         ]
         self.up_gate_proj_scale_shape = [layer.num_local_experts, layer.moe_intermediate_size * 2]
         self.down_proj_scale_shape = [layer.num_local_experts, layer.hidden_size]
-
-        if self.quant_config.is_checkpoint_bf16:
+        # TODO(bukejiyu): remove v1 loader check when v0 loader is removed
+        if self.quant_config.is_checkpoint_bf16 and layer.fd_config.load_config.load_choices == "default_v1":
             layer.up_gate_proj_weight = layer.create_parameter(
                 shape=[layer.num_local_experts, layer.hidden_size, layer.moe_intermediate_size * 2],
                 dtype=layer.weight_dtype,
@@ -1037,7 +1037,7 @@ class CutlassWeightOnlyMoEMethod(CutlassMoEMethod):
                 dtype=layer.weight_dtype,
                 default_initializer=paddle.nn.initializer.Constant(0),
             )
-
+            extra_weight_attrs["weight_need_transpose"] = extra_weight_attrs.get("model_format") == "torch"
             set_weight_attrs(
                 layer.up_gate_proj_weight,
                 {
@@ -1097,7 +1097,7 @@ class CutlassWeightOnlyMoEMethod(CutlassMoEMethod):
                     default_initializer=paddle.nn.initializer.Constant(0),
                 ),
             )
-
+            extra_weight_attrs["weight_need_transpose"] = not extra_weight_attrs.get("model_format") == "torch"
             moe_extra_weight_attrs = {**extra_weight_attrs, "SHARD_ID_TO_SHARDED_DIM": {"gate": 0, "down": 1, "up": 0}}
             set_weight_attrs(layer.up_gate_proj_weight, moe_extra_weight_attrs)
             set_weight_attrs(layer.down_proj_weight, moe_extra_weight_attrs)
@@ -57,7 +57,8 @@ class DeepGemmFusedMoeMethod(MoEMethodBase):
             ceil_div(layer.hidden_size, self.quant_config.weight_block_size[0]),
             ceil_div(layer.moe_intermediate_size, self.quant_config.weight_block_size[1]),
         ]
-        if self.quant_config.is_checkpoint_bf16:
+        # TODO(bukejiyu): remove v1 loader check when v0 loader is removed
+        if self.quant_config.is_checkpoint_bf16 and layer.fd_config.load_config.load_choices == "default_v1":
             layer.up_gate_proj_weight = layer.create_parameter(
                 shape=[layer.num_local_experts, layer.hidden_size, layer.moe_intermediate_size * 2],
                 dtype=layer.weight_dtype,
@@ -69,6 +70,7 @@ class DeepGemmFusedMoeMethod(MoEMethodBase):
                 dtype=layer.weight_dtype,
                 default_initializer=paddle.nn.initializer.Constant(0),
             )
+            extra_weight_attrs["weight_need_transpose"] = extra_weight_attrs.get("model_format") == "torch"
             set_weight_attrs(
                 layer.up_gate_proj_weight,
                 {
@@ -127,6 +129,25 @@ class DeepGemmFusedMoeMethod(MoEMethodBase):
                     default_initializer=paddle.nn.initializer.Constant(0),
                 ),
             )
+            extra_weight_attrs["weight_need_transpose"] = not extra_weight_attrs.get("model_format") == "torch"
+            extra_weight_attrs = {**extra_weight_attrs, "SHARD_ID_TO_SHARDED_DIM": {"gate": 0, "down": 1, "up": 0}}
+            set_weight_attrs(
+                getattr(layer, up_gate_proj_weight_name),
+                extra_weight_attrs,
+            )
+            set_weight_attrs(
+                getattr(layer, up_gate_proj_scale_name),
+                extra_weight_attrs,
+            )
+
+            set_weight_attrs(
+                getattr(layer, down_proj_weight_name),
+                extra_weight_attrs,
+            )
+            set_weight_attrs(
+                getattr(layer, down_proj_scale_name),
+                extra_weight_attrs,
+            )

     def process_weights_after_loading(self, layer):
         """ """
@@ -169,6 +190,7 @@ class DeepGemmFusedMoeMethod(MoEMethodBase):
                     getattr(layer, unquantized_weight_name)[expert_id], self.quant_config.weight_block_size
                 )
                 weight[expert_id].copy_(weight_quant, False)

             getattr(layer, unquantized_weight_name).value().get_tensor()._clear()

             # create weight
@@ -72,7 +72,8 @@ class TritonWeightOnlyMoEMethod(QuantMethodBase):
             layer.moe_intermediate_size,
             layer.hidden_size,
         ]
-        if self.quant_config.is_checkpoint_bf16:
+        # TODO(bukejiyu): remove v1 loader check when v0 loader is removed
+        if self.quant_config.is_checkpoint_bf16 and layer.fd_config.load_config.load_choices == "default_v1":
             layer.up_gate_proj_weight = layer.create_parameter(
                 shape=self.up_gate_proj_weight_shape,
                 dtype=layer.weight_dtype,
@@ -84,6 +85,8 @@ class TritonWeightOnlyMoEMethod(QuantMethodBase):
                 dtype=layer.weight_dtype,
                 default_initializer=paddle.nn.initializer.Constant(0),
             )
+            extra_weight_attrs["weight_need_transpose"] = extra_weight_attrs.get("model_format") == "torch"
+
             set_weight_attrs(
                 layer.up_gate_proj_weight,
                 {
@@ -136,6 +139,7 @@ class TritonWeightOnlyMoEMethod(QuantMethodBase):
                     default_initializer=paddle.nn.initializer.Constant(0),
                 ),
             )
+            # support cache feature in future

     def process_loaded_weights(self, layer: nn.Layer, state_dict):
         """
@@ -723,7 +727,8 @@ class BlockWiseFP8MoEMethod(QuantMethodBase):
             ceil_div(layer.hidden_size, self.quant_config.weight_block_size[0]),
             ceil_div(layer.moe_intermediate_size, self.quant_config.weight_block_size[1]),
         ]
-        if self.quant_config.is_checkpoint_bf16:
+        # TODO(bukejiyu): remove v1 loader check when v0 loader is removed
+        if self.quant_config.is_checkpoint_bf16 and layer.fd_config.load_config.load_choices == "default_v1":
             layer.up_gate_proj_weight = layer.create_parameter(
                 shape=[layer.num_local_experts, layer.hidden_size, layer.moe_intermediate_size * 2],
                 dtype=layer.weight_dtype,
@@ -735,6 +740,7 @@ class BlockWiseFP8MoEMethod(QuantMethodBase):
                 dtype=layer.weight_dtype,
                 default_initializer=paddle.nn.initializer.Constant(0),
             )
+            extra_weight_attrs["weight_need_transpose"] = extra_weight_attrs.get("model_format") == "torch"
             set_weight_attrs(
                 layer.up_gate_proj_weight,
                 {
@@ -794,6 +800,26 @@ class BlockWiseFP8MoEMethod(QuantMethodBase):
                 ),
             )

+            extra_weight_attrs["weight_need_transpose"] = not extra_weight_attrs.get("model_format") == "torch"
+            extra_weight_attrs = {**extra_weight_attrs, "SHARD_ID_TO_SHARDED_DIM": {"gate": 0, "down": 1, "up": 0}}
+            set_weight_attrs(
+                getattr(layer, up_gate_proj_weight_name),
+                extra_weight_attrs,
+            )
+            set_weight_attrs(
+                getattr(layer, up_gate_proj_scale_name),
+                extra_weight_attrs,
+            )
+
+            set_weight_attrs(
+                getattr(layer, down_proj_weight_name),
+                extra_weight_attrs,
+            )
+            set_weight_attrs(
+                getattr(layer, down_proj_scale_name),
+                extra_weight_attrs,
+            )

     def process_weights_after_loading(self, layer):
         """ """
         if not self.quant_config.is_checkpoint_bf16:
@@ -206,20 +206,19 @@ class FusedMoE(nn.Layer):

         if shard_id is None:
             # 1.gate up fused in disk
-            model_format = getattr(param, "model_format", "")
-            is_torch_model = model_format == "torch"
+            weight_need_transpose = getattr(param, "weight_need_transpose", False)
             output_size = param[expert_id - self.expert_id_offset].shape[SHARD_ID_TO_SHARDED_DIM["gate"]]
             per_rank = output_size // 2
             start = self.tp_rank * per_rank
             loaded_weight_shard_gate = slice_fn(
-                loaded_weight, is_torch_model ^ SHARD_ID_TO_SHARDED_DIM["gate"], start, start + per_rank
+                loaded_weight, weight_need_transpose ^ SHARD_ID_TO_SHARDED_DIM["gate"], start, start + per_rank
             )
             self._load_gate_up_weight(
                 param, expert_id, loaded_weight_shard_gate, "gate", SHARD_ID_TO_SHARDED_DIM["gate"], is_sharded=True
             )
             start_up = output_size // 2 * self.tp_size + self.tp_rank * per_rank
             loaded_weight_shard_up = slice_fn(
-                loaded_weight, is_torch_model ^ SHARD_ID_TO_SHARDED_DIM["up"], start_up, start_up + per_rank
+                loaded_weight, weight_need_transpose ^ SHARD_ID_TO_SHARDED_DIM["up"], start_up, start_up + per_rank
             )
             self._load_gate_up_weight(
                 param, expert_id, loaded_weight_shard_up, "up", SHARD_ID_TO_SHARDED_DIM["up"], is_sharded=True
@@ -236,10 +235,9 @@ class FusedMoE(nn.Layer):
             )

     def _load_gate_up_weight(self, param, expert_id, loaded_weight, shard_id, shard_dim=None, is_sharded=False):
-        model_format = getattr(param, "model_format", "")
-        is_torch_model = model_format == "torch"
+        weight_need_transpose = getattr(param, "weight_need_transpose", False)
         if self.tp_size > 1 and not is_sharded:
-            tp_shard_dim = is_torch_model ^ shard_dim
+            tp_shard_dim = weight_need_transpose ^ shard_dim
             weight_dim = -1 if tp_shard_dim else 0
             if isinstance(loaded_weight, (np.ndarray, paddle.Tensor)):
                 size = loaded_weight.shape[weight_dim]
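A brief illustration (not part of this commit) of the `weight_need_transpose ^ shard_dim` expression above: since `shard_dim` is 0 or 1 and the transpose flag is a bool, the XOR flips which axis is sharded whenever the on-disk layout is transposed. A minimal sketch with illustrative values:

    # Sketch only: XOR of a bool and a 0/1 shard dim selects the axis to slice.
    for weight_need_transpose in (False, True):
        for shard_dim in (0, 1):
            tp_shard_dim = weight_need_transpose ^ shard_dim
            # False ^ 0 -> 0, False ^ 1 -> 1 (layout unchanged)
            # True ^ 0  -> 1, True ^ 1  -> 0 (axes swapped for torch-format weights)
            print(weight_need_transpose, shard_dim, tp_shard_dim)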
@@ -275,13 +273,17 @@ class FusedMoE(nn.Layer):
         assert expert_param.shape == loaded_weight.shape, (
             f"Attempted to load weight ({loaded_weight.shape}) " f"into parameter ({expert_param.shape})"
         )
+        if expert_param.dtype != loaded_weight.dtype:
+            if loaded_weight.dtype == paddle.int8 and expert_param.dtype == paddle.float8_e4m3fn:
+                loaded_weight = loaded_weight.view(expert_param.dtype)
+            else:
+                loaded_weight = loaded_weight.cast(expert_param.dtype)
         expert_param.copy_(loaded_weight, False)

     def _load_down_weight(self, param, expert_id, loaded_weight, shard_id, shard_dim=None):
-        model_format = getattr(param, "model_format", "")
-        is_torch_model = model_format == "torch"
+        weight_need_transpose = getattr(param, "weight_need_transpose", False)
         if self.tp_size > 1 and shard_dim is not None:
-            tp_shard_dim = is_torch_model ^ shard_dim
+            tp_shard_dim = weight_need_transpose ^ shard_dim
             dim = -1 if tp_shard_dim else 0
             if isinstance(loaded_weight, paddle.Tensor):
                 size = loaded_weight.shape[dim]
@@ -302,6 +304,11 @@ class FusedMoE(nn.Layer):
         assert expert_param.shape == loaded_weight.shape, (
             f"Attempted to load weight ({loaded_weight.shape}) " f"into parameter ({expert_param.shape})"
         )
+        if expert_param.dtype != loaded_weight.dtype:
+            if loaded_weight.dtype == paddle.int8 and expert_param.dtype == paddle.float8_e4m3fn:
+                loaded_weight = loaded_weight.view(expert_param.dtype)
+            else:
+                loaded_weight = loaded_weight.cast(expert_param.dtype)
         expert_param.copy_(loaded_weight, False)

     def _load_expert_weight(
@@ -34,6 +34,72 @@ QUANTIZATION_METHODS: List[str] = [
 ]


+def parse_quant_config(args, model_config, is_ernie, is_v1_loader):
+    # 1.model_config.is_quantized
+    # TODO(bukejiyu) model_config.is_quantized is v0 only need to be removed in future
+    if model_config.model_format == "torch":
+        quantization_config = model_config.quantization_config
+        if quantization_config is not None:
+            model_config.is_quantized = True
+    else:
+        quantization_config = model_config.quantization_config
+        if not model_config.is_quantized:
+            if quantization_config is not None:
+                if "is_quantized" in quantization_config:
+                    model_config.is_quantized = quantization_config["is_quantized"]
+                elif "kv_cache_quant_type" not in quantization_config:
+                    model_config.is_quantized = True
+    if quantization_config is not None and quantization_config.get("quantization", None) is None:
+        raise ValueError(
+            "quantization_config should have a key named 'quantization' for specify quant config."
+        )
+
+    quant_config_name = None
+
+    if quantization_config is not None:
+        quant_config_name = _get_offline_quant_config_name(
+            quantization_config, model_config.model_format == "torch", is_v1_loader
+        )
+    elif args.quantization is not None:
+        quantization_config = {}
+        try:
+            quantization_config.update(args.quantization)
+            quant_config_name = quantization_config["quantization"]
+        except:
+            quant_config_name = args.quantization["quantization"]
+            quantization_config["quantization"] = quant_config_name
+        # Special handling for Ernie models
+        if quant_config_name == "wint4" and is_ernie:
+            quantization_config["dense_quant_type"] = "wint8"
+            quantization_config["moe_quant_type"] = "wint4"
+            quantization_config["quantization"] = "mix_quant"
+            quant_config_name = "mix_quant"
+    else:
+        quant_config_name = None
+    if quant_config_name is None:
+        quant_config = None
+    else:
+        if not quantization_config.get("is_quantized"):
+            quantization_config["is_quantized"] = model_config.is_quantized
+        quant_cls = get_quantization_config(quant_config_name)
+        quant_config = quant_cls.from_config(quantization_config)
+    return quant_config
+
+
+def _get_offline_quant_config_name(quantization_config, is_torch_weight, is_v1_loader):
+    if is_torch_weight:
+        # only support block_wise_fp8 now
+        quant_method = quantization_config.get("quant_method")
+        has_block_size = "weight_block_size" in quantization_config
+        if quant_method == "fp8" and has_block_size:
+            quant_config_name = "block_wise_fp8"
+        else:
+            raise ValueError("Torch weight offline quantization only supports block-wise FP8.")
+    else:
+        quant_config_name = quantization_config["quantization"]
+    return quant_config_name
+
+
 def get_quantization_config(quantization: str) -> Type[QuantConfigBase]:
     """
     Get the quantization config class by the quantization name.
@@ -53,7 +53,7 @@ class BlockWiseFP8Config(QuantConfigBase):
     @classmethod
     def from_config(cls, config: dict) -> "BlockWiseFP8Config":
         weight_block_size = config.get("weight_block_size", [128, 128])
-        is_checkpoint_bf16 = config.get("is_checkpoint_bf16", False)
+        is_checkpoint_bf16 = not config.get("is_quantized", False)
         return cls(weight_block_size, is_checkpoint_bf16)

     def get_quant_method(self, layer) -> Optional[QuantMethodBase]:
@@ -89,13 +89,15 @@ class BlockWiseFP8LinearMethod(QuantMethodBase):
         self.quant_config = quant_config

     def create_weights(self, layer, **extra_weight_attrs):
-        if self.quant_config.is_checkpoint_bf16:
+        # TODO(bukejiyu): remove v1 loader check when v0 loader is removed
+        if self.quant_config.is_checkpoint_bf16 and layer.fd_config.load_config.load_choices == "default_v1":
             layer.weight = layer.create_parameter(
                 shape=layer.weight_shape,
                 dtype=layer.weight_dtype,
                 is_bias=False,
                 default_initializer=paddle.nn.initializer.Constant(0),
             )
+            extra_weight_attrs["weight_need_transpose"] = extra_weight_attrs.get("model_format") == "torch"
             quant_attrs = extra_weight_attrs
             if isinstance(layer, MergedColumnParallelLinear) or isinstance(layer, QKVParallelLinear):
                 quant_attrs = {
@@ -120,14 +122,28 @@ class BlockWiseFP8LinearMethod(QuantMethodBase):

             layer.weight_scale_inv = layer.create_parameter(
                 shape=[
-                    (layer.output_size + self.quant_config.weight_block_size[0] - 1)
+                    (layer.weight_shape[0] + self.quant_config.weight_block_size[0] - 1)
                     // self.quant_config.weight_block_size[0],
-                    (layer.input_size + self.quant_config.weight_block_size[1] - 1)
+                    (layer.weight_shape[1] + self.quant_config.weight_block_size[1] - 1)
                     // self.quant_config.weight_block_size[1],
                 ],
                 dtype="float32",
                 is_bias=False,
             )
+            extra_weight_attrs["output_dim"] = not extra_weight_attrs["output_dim"]
+
+            extra_weight_attrs["weight_need_transpose"] = not extra_weight_attrs.get("model_format") == "torch"
+            set_weight_attrs(
+                layer.weight,
+                extra_weight_attrs,
+            )
+            set_weight_attrs(
+                layer.weight_scale_inv,
+                {
+                    **extra_weight_attrs,
+                    "is_scale": True,
+                },
+            )

     def process_weights_after_loading(self, layer) -> None:
         if not self.quant_config.is_checkpoint_bf16:
@@ -37,7 +37,7 @@ class MixQuantConfig(QuantConfigBase):
         is_channel_wise: bool = False,
         has_zero_point: bool = False,
         is_permuted: bool = True,
-        is_checkpoint_bf16: bool = False,
+        is_quantized: bool = False,
         hadamard_block_size: int = 128,
     ) -> None:
         super().__init__()
@@ -54,7 +54,8 @@ class MixQuantConfig(QuantConfigBase):
         self.quant_min_bound = 0
         self.quant_round_type = 0
         self.is_permuted = is_permuted
-        self.is_checkpoint_bf16 = is_checkpoint_bf16
+        self.is_checkpoint_bf16 = not is_quantized
+        self.is_quantized = is_quantized
         self.hadamard_block_size = hadamard_block_size

     def name(self) -> str:
@@ -70,7 +71,7 @@ class MixQuantConfig(QuantConfigBase):
             config.get("is_channel_wise", False),
             config.get("has_zero_point", False),
             config.get("is_permuted", True),
-            config.get("is_checkpoint_bf16", False),
+            config.get("is_quantized", False),
             config.get("hadamard_block_size", 128),
         )

@@ -82,7 +83,7 @@ class MixQuantConfig(QuantConfigBase):
                 .from_config(
                     {
                         "is_permuted": self.is_permuted,
-                        "is_checkpoint_bf16": self.is_checkpoint_bf16,
+                        "is_quantized": self.is_quantized,
                         "hadamard_block_size": self.hadamard_block_size,
                     }
                 )
@@ -94,7 +95,7 @@ class MixQuantConfig(QuantConfigBase):
                 .from_config(
                     {
                         "is_permuted": self.is_permuted,
-                        "is_checkpoint_bf16": self.is_checkpoint_bf16,
+                        "is_quantized": self.is_quantized,
                         "hadamard_block_size": self.hadamard_block_size,
                     }
                 )
@@ -112,6 +113,6 @@ class MixQuantConfig(QuantConfigBase):
         else:
             return (
                 get_quantization_config(self.dense_quant_type)
-                .from_config({"is_checkpoint_bf16": self.is_checkpoint_bf16})
+                .from_config({"is_quantized": self.is_quantized})
                 .get_quant_method(layer)
             )
@@ -65,7 +65,7 @@ class WeightOnlyConfig(QuantConfigBase):
     @classmethod
     def from_config(cls, config: dict) -> "WeightOnlyConfig":
         algo = config["algo"]
-        is_checkpoint_bf16 = config.get("is_checkpoint_bf16", False)
+        is_checkpoint_bf16 = not config.get("is_quantized", False)
         return cls(algo, is_checkpoint_bf16)

     def get_quant_method(self, layer) -> Optional[QuantMethodBase]:
@@ -162,7 +162,7 @@ class WINT8Config(WeightOnlyConfig):

     @classmethod
     def from_config(cls, config: dict) -> "WINT8Config":
-        is_checkpoint_bf16 = config.get("is_checkpoint_bf16", False)
+        is_checkpoint_bf16 = not config.get("is_quantized", False)
         return cls(is_checkpoint_bf16)

     def name(self) -> str:
@@ -182,7 +182,7 @@ class WINT4Config(WeightOnlyConfig):

     @classmethod
     def from_config(cls, config: dict) -> "WINT4Config":
-        is_checkpoint_bf16 = config.get("is_checkpoint_bf16", False)
+        is_checkpoint_bf16 = not config.get("is_quantized", False)
         return cls(is_checkpoint_bf16)

     def name(self) -> str:
@@ -202,13 +202,15 @@ class WeightOnlyLinearMethod(QuantMethodBase):
         self.quant_config = quant_config

     def create_weights(self, layer, **extra_weight_attrs):
-        if self.quant_config.is_checkpoint_bf16:
+        # TODO(bukejiyu): remove v1 loader check when v0 loader is removed
+        if self.quant_config.is_checkpoint_bf16 and layer.fd_config.load_config.load_choices == "default_v1":
             layer.weight = layer.create_parameter(
                 shape=layer.weight_shape,
                 dtype=layer.weight_dtype,
                 is_bias=False,
                 default_initializer=paddle.nn.initializer.Constant(0),
             )
+            extra_weight_attrs["weight_need_transpose"] = extra_weight_attrs.get("model_format") == "torch"
             quant_attrs = extra_weight_attrs
             if (
                 isinstance(layer, MergedColumnParallelLinear)
@@ -256,6 +258,7 @@ class WeightOnlyLinearMethod(QuantMethodBase):
                 {
                     "weight_loader": weight_loader,
                     "output_dim": output_dim,
+                    "weight_need_transpose": not extra_weight_attrs.get("model_format") == "torch",
                 },
             )

@@ -60,7 +60,7 @@ class WFP8AFP8Config(QuantConfigBase):
     @classmethod
     def from_config(cls, config: dict) -> "WFP8AFP8Config":
         """ """
-        is_checkpoint_bf16 = config.get("is_checkpoint_bf16", False)
+        is_checkpoint_bf16 = not config.get("is_quantized", False)
         return cls(is_checkpoint_bf16=is_checkpoint_bf16)

     def get_quant_method(self, layer) -> Optional[QuantMethodBase]:
@@ -92,13 +92,14 @@ class WFP8AFP8LinearMethod(QuantMethodBase):
                 (weight_shape[i] + weight_block_size[i] - 1) // weight_block_size[i] if weight_block_size[i] > 0 else 1
             )
         scale_shape = scale_shape[::-1]
-        if self.quant_config.is_checkpoint_bf16:
+        if self.quant_config.is_checkpoint_bf16 and layer.fd_config.load_config.load_choices == "default_v1":
             layer.weight = layer.create_parameter(
                 shape=weight_shape,
                 dtype=layer.weight_dtype,
                 is_bias=False,
                 default_initializer=paddle.nn.initializer.Constant(0),
             )
+            extra_weight_attrs["weight_need_transpose"] = extra_weight_attrs.get("model_format") == "torch"
             quant_attrs = extra_weight_attrs
             if isinstance(layer, MergedColumnParallelLinear) or isinstance(layer, QKVParallelLinear):
                 quant_attrs = {
@@ -98,7 +98,7 @@ def is_weight_cache_enabled(fd_config, weight_cache_path=".cache"):
            f"Loading will prioritize cached models. Users are responsible for ensuring the saved model is correct. If any error occurs, deleting the cache at {weight_cache_dir} may resolve it."
        )
        enable_cache = True
-       weight_cache_context = switch_config_context(fd_config.quant_config, "is_checkpoint_bf16", False)
+       weight_cache_context = switch_config_context(fd_config.quant_config, "is_quantized", True)

    return enable_cache, weight_cache_dir, weight_cache_context

@@ -150,7 +150,8 @@ def save_model(model_arg_name="model", config_arg_name="fd_config"):
                )
                _save_model(model.state_dict(), os.path.join(tp_weight_cache_dir, "cache.pdparams"))
            else:
-               logger.info("Weights are already cached, skip saving")
+               reason = "weights already cached" if envs.FD_ENABLE_MODEL_LOAD_CACHE else "cache disabled"
+               logger.info(f"Skip saving ,{reason}")
            return result

        return wrapper
@@ -527,6 +527,7 @@ class Ernie4_5_MoeForCausalLM(ModelForCasualLM):
         from fastdeploy.model_executor.utils import (
             default_weight_loader,
             process_weights_after_loading,
+            rename_offline_ckpt_suffix_to_fd_suffix,
         )

         general_params_mapping = [
@@ -564,15 +565,20 @@ class Ernie4_5_MoeForCausalLM(ModelForCasualLM):
             param_down_proj_name="experts.down_proj_",
             num_experts_start_offset=num_experts_start_offset,
         )
-        all_param_mapping = general_params_mapping + expert_params_mapping
+        all_param_mapping = [
+            (param, weight, exp, shard, False) for param, weight, exp, shard in general_params_mapping
+        ] + [(param, weight, exp, shard, True) for param, weight, exp, shard in expert_params_mapping]
+        checkpoint_to_fd_key_fn = rename_offline_ckpt_suffix_to_fd_suffix(
+            fd_config=self.fd_config, ckpt_weight_suffix="quant_weight", ckpt_scale_suffix="weight_scale"
+        )
         params_dict = dict(self.named_parameters())

         process_weights_after_loading_fn = process_weights_after_loading(dict(self.named_sublayers()))

         for loaded_weight_name, loaded_weight in weights_iterator:
             loaded_weight_name = loaded_weight_name.replace("model", "ernie")
-            for param_name, weight_name, exp_id, shard_id in all_param_mapping:
+            for param_name, weight_name, exp_id, shard_id, is_moe in all_param_mapping:
+                loaded_weight_name = checkpoint_to_fd_key_fn(loaded_weight_name, is_moe)
                 model_param_name = loaded_weight_name.replace(weight_name, param_name)
                 if model_param_name not in params_dict:
                     continue
@@ -583,6 +589,7 @@ class Ernie4_5_MoeForCausalLM(ModelForCasualLM):
             else:
                 expert_id = None
                 shard_id = None
+                loaded_weight_name = checkpoint_to_fd_key_fn(loaded_weight_name, is_moe=False)
                 model_param_name = loaded_weight_name
                 if model_param_name not in params_dict.keys():
                     continue
@@ -193,16 +193,16 @@ class VisionFlashAttention2(nn.Layer):
         self.qkv = nn.Linear(dim, dim * 3, bias_attr=True)
         self.proj = nn.Linear(dim, dim)

-        set_weight_attrs(self.qkv.weight, {"model_format": model_format})
-        set_weight_attrs(self.proj.weight, {"model_format": model_format})
+        set_weight_attrs(self.qkv.weight, {"weight_need_transpose": model_format == "torch"})
+        set_weight_attrs(self.proj.weight, {"weight_need_transpose": model_format == "torch"})
         self.head_dim = dim // num_heads  # must added
         self.num_heads = num_heads
         self.hidden_size = dim
         self.num_heads_per_rank = divide(self.num_heads, self.tensor_parallel_degree)

     def weight_loader(self, param, loaded_weight, loaded_shard_id: Optional[str] = None):
-        model_format = getattr(param, "model_format", "")
-        if model_format == "torch":
+        weight_need_transpose = getattr(param, "weight_need_transpose", False)
+        if weight_need_transpose:
             loaded_weight = loaded_weight.transpose([1, 0])
         load_bias = getattr(param, "load_bias", None)
         if load_bias:
@@ -358,8 +358,8 @@ class VisionMlp(nn.Layer):
         self.fc1 = nn.Linear(dim, hidden_dim)
         self.fc2 = nn.Linear(hidden_dim, dim)

-        set_weight_attrs(self.fc1.weight, {"model_format": model_format})
-        set_weight_attrs(self.fc2.weight, {"model_format": model_format})
+        set_weight_attrs(self.fc1.weight, {"weight_need_transpose": model_format == "torch"})
+        set_weight_attrs(self.fc2.weight, {"weight_need_transpose": model_format == "torch"})

         self.act = ACT2FN[hidden_act]

@@ -528,8 +528,10 @@ class DFNRopeVisionTransformerPretrainedModel(PretrainedModel):
             in_channels=config.vision_config.in_channels,
             embed_dim=config.vision_config.embed_dim,
         )

         model_format = getattr(config, "model_format", "")
-        set_weight_attrs(self.patch_embed.proj.weight, {"model_format": model_format})
+
+        set_weight_attrs(self.patch_embed.proj.weight, {"weight_need_transpose": model_format == "torch"})
+
         head_dim = config.vision_config.embed_dim // config.vision_config.num_heads
         self.rotary_pos_emb = VisionRotaryEmbedding(head_dim // 2)
@@ -181,8 +181,8 @@ class VariableResolutionResamplerModel(nn.Layer):
             nn.Linear(self.spatial_dim, self.spatial_dim),
             nn.LayerNorm(self.spatial_dim, epsilon=1e-6),
         )
-        set_weight_attrs(self.spatial_linear[0].weight, {"model_format": config.model_format})
-        set_weight_attrs(self.spatial_linear[2].weight, {"model_format": config.model_format})
+        set_weight_attrs(self.spatial_linear[0].weight, {"weight_need_transpose": config.model_format == "torch"})
+        set_weight_attrs(self.spatial_linear[2].weight, {"weight_need_transpose": config.model_format == "torch"})

         if self.use_temporal_conv:
             self.temporal_linear = nn.Sequential(
@@ -191,12 +191,16 @@ class VariableResolutionResamplerModel(nn.Layer):
                 nn.Linear(self.spatial_dim, self.spatial_dim),
                 nn.LayerNorm(self.spatial_dim, epsilon=1e-6),
             )
-            set_weight_attrs(self.temporal_linear[0].weight, {"model_format": config.model_format})
-            set_weight_attrs(self.temporal_linear[2].weight, {"model_format": config.model_format})
+            set_weight_attrs(
+                self.temporal_linear[0].weight, {"weight_need_transpose": config.model_format == "torch"}
+            )
+            set_weight_attrs(
+                self.temporal_linear[2].weight, {"weight_need_transpose": config.model_format == "torch"}
+            )

         self.mlp = nn.Linear(self.spatial_dim, self.out_dim)

-        set_weight_attrs(self.mlp.weight, {"model_format": config.model_format})
+        set_weight_attrs(self.mlp.weight, {"weight_need_transpose": config.model_format == "torch"})

         out_config = deepcopy(config)
         out_config.hidden_size = out_dim
@@ -14,6 +14,7 @@
 # limitations under the License.
 """

+import re
 from contextlib import contextmanager
 from typing import Any, Optional, Union

@@ -158,8 +159,8 @@ def default_weight_loader(fd_config: FDConfig) -> None:
     def fn(param, loaded_weight, shard_id: Optional[Union[int, str]] = None):
         """fn"""
         output_dim = getattr(param, "output_dim", None)
-        model_format = getattr(param, "model_format", "")
-        if model_format == "torch":
+        weight_need_transpose = getattr(param, "weight_need_transpose", False)
+        if weight_need_transpose:
             loaded_weight = get_tensor(loaded_weight)
             loaded_weight = loaded_weight.transpose([1, 0])
         # Tensor parallelism splits the weight along the output_dim
@@ -177,6 +178,9 @@ def default_weight_loader(fd_config: FDConfig) -> None:
             loaded_weight = get_tensor(loaded_weight)
             # mlp.gate.weight is precision-sensitive, so we cast it to float32 for computation
             if param.dtype != loaded_weight.dtype:
-                loaded_weight = loaded_weight.cast(param.dtype)
+                if loaded_weight.dtype == paddle.int8 and param.dtype == paddle.float8_e4m3fn:
+                    loaded_weight = loaded_weight.view(param.dtype)
+                else:
+                    loaded_weight = loaded_weight.cast(param.dtype)
             if param.shape != loaded_weight.shape:
                 # for e_score_correction_bias
@@ -210,3 +214,50 @@ def switch_config_context(config_obj, config_attr_name, value):
         yield
     finally:
         setattr(config_obj, config_attr_name, origin_value)
+
+
+def rename_offline_ckpt_suffix_to_fd_suffix(
+    fd_config, ckpt_weight_suffix: str = "quant_weight", ckpt_scale_suffix="weight_scale"
+):
+    """
+    Create a function to rename checkpoint key suffixes for FastDeploy.
+
+    Maps offline-quantized checkpoint suffixes (default "quant_weight" and "weight_scale")
+    to the suffixes FastDeploy expects ("weight" and "weight_scale_inv"). Only the suffix is changed.
+
+    Args:
+        fd_config: FastDeploy configuration.
+        ckpt_weight_suffix: Weight-key suffix used in the checkpoint.
+        ckpt_scale_suffix: Scale-key suffix used in the checkpoint.
+
+    Returns:
+        Callable: Function that renames checkpoint keys.
+    """
+    fd_suffix_map = {}  # noqa: F841
+    fp8_suffix_map = {
+        ckpt_weight_suffix: "weight",
+        ckpt_scale_suffix: "weight_scale_inv",
+    }
+    moe_quant_type = ""
+    dense_quant_type = ""
+    if fd_config.quant_config is not None:
+        if fd_config.quant_config.name() == "mix_quant":
+            moe_quant_type = fd_config.quant_config.moe_quant_type
+            dense_quant_type = fd_config.quant_config.dense_quant_type
+        else:
+            moe_quant_type = fd_config.quant_config.name()
+            dense_quant_type = fd_config.quant_config.name()
+
+    def fn(loaded_weight_name, is_moe):
+        if fd_config.quant_config is None or fd_config.quant_config.is_checkpoint_bf16:
+            return loaded_weight_name
+        # Can be extended to other offline quantization suffixes if needed.
+        if (is_moe and moe_quant_type == "block_wise_fp8") or (not is_moe and dense_quant_type == "block_wise_fp8"):
+            fd_suffix_map = fp8_suffix_map
+            for ckpt_suffix, fd_suffix in fd_suffix_map.items():
+                if re.search(rf"{ckpt_suffix}$", loaded_weight_name):
+                    loaded_weight_name = loaded_weight_name.replace(ckpt_suffix, fd_suffix)
+                    return loaded_weight_name
+        return loaded_weight_name
+
+    return fn
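A hedged usage sketch of the renaming helper above. The config object here is a minimal stand-in (the real FDConfig and its quant config carry far more state), and the key names are hypothetical; only the suffix handling mirrors the function:

from types import SimpleNamespace

class _FakeQuantConfig:
    is_checkpoint_bf16 = False
    def name(self):
        return "block_wise_fp8"

fake_cfg = SimpleNamespace(quant_config=_FakeQuantConfig())
rename = rename_offline_ckpt_suffix_to_fd_suffix(fake_cfg)
print(rename("model.layers.0.mlp.up_proj.quant_weight", is_moe=False))   # ...up_proj.weight
print(rename("model.layers.0.mlp.up_proj.weight_scale", is_moe=False))   # ...up_proj.weight_scale_inv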
@@ -42,7 +42,7 @@ from fastdeploy.config import (
 from fastdeploy.input.ernie4_5_tokenizer import Ernie4_5Tokenizer
 from fastdeploy.inter_communicator import EngineWorkerQueue as TaskQueue
 from fastdeploy.inter_communicator import IPCSignal
-from fastdeploy.model_executor.layers.quantization import get_quantization_config
+from fastdeploy.model_executor.layers.quantization import parse_quant_config
 from fastdeploy.platforms import current_platform
 from fastdeploy.utils import get_logger, parse_quantization
 from fastdeploy.worker.worker_base import WorkerBase
@@ -698,50 +698,12 @@ def initialize_fd_config(args, ranks: int = 1, local_rank: int = 0) -> FDConfig:
     if getattr(model_config, "num_hidden_layers", None) is None:
         raise ValueError("num_hidden_layers is None")

-    quantization_config = model_config.quantization_config
-    if not model_config.is_quantized:
-        if quantization_config is not None:
-            if "is_quantized" in quantization_config:
-                model_config.is_quantized = quantization_config["is_quantized"]
-            elif "kv_cache_quant_type" not in quantization_config:
-                model_config.is_quantized = True
-
-    quant_config_name = None
-    if quantization_config is not None and quantization_config.get("quantization", None) is None:
-        raise ValueError("quantization_config should have a key named 'quantization' for specify quant config.")
-
-    if quantization_config is not None:
-        quant_config_name = quantization_config["quantization"]
-        # TODO(YuanRisheng) is_checkpoint_bf16 may need to be removed and replaced by is_quantized in future
-        if "kv_cache_quant_type" in quantization_config and load_config.load_choices == "default_v1":
-            quantization_config["is_checkpoint_bf16"] = True
-
-    elif args.quantization is not None:
-        quantization_config = {}
-        try:
-            quantization_config.update(args.quantization)
-            quant_config_name = quantization_config["quantization"]
-        except:
-            quant_config_name = args.quantization["quantization"]
-            quantization_config["quantization"] = quant_config_name
-        # Only v1 loader sets is_checkpoint_bf16=True during dynamic quantization.
-        if load_config.load_choices == "default_v1":
-            quantization_config["is_checkpoint_bf16"] = True
-        # Special handling for Ernie models
-        is_ernie = ErnieArchitectures.contains_ernie_arch(model_config.architectures)
-        if quant_config_name == "wint4" and is_ernie:
-            quantization_config["dense_quant_type"] = "wint8"
-            quantization_config["moe_quant_type"] = "wint4"
-            quantization_config["quantization"] = "mix_quant"
-            quant_config_name = "mix_quant"
-    else:
-        quant_config_name = None
-
-    if quant_config_name is None:
-        quant_config = None
-    else:
-        quant_cls = get_quantization_config(quant_config_name)
-        quant_config = quant_cls.from_config(quantization_config)
+    quant_config = parse_quant_config(
+        args,
+        model_config,
+        is_ernie=ErnieArchitectures.contains_ernie_arch(model_config.architectures),
+        is_v1_loader=load_config.load_choices == "default_v1",
+    )

     # Log quantization info
     logger.info("===========quantization_config==============")
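For reference, the removed inline branch effectively built a dict like the one below when an Ernie architecture requested wint4; the new parse_quant_config call is assumed to reproduce the same decision. The values come straight from the deleted lines, while the snippet itself is only illustrative:

# Illustrative reconstruction of the removed special case (not the new API itself):
quantization_config = {
    "quantization": "mix_quant",       # wint4 on an Ernie arch was promoted to mix_quant
    "dense_quant_type": "wint8",
    "moe_quant_type": "wint4",
    "is_checkpoint_bf16": True,        # only when load_config.load_choices == "default_v1"
}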
@@ -751,7 +713,7 @@ def initialize_fd_config(args, ranks: int = 1, local_rank: int = 0) -> FDConfig:
         else:
             logger.info("Model Status: Original (will apply online quantization)")

-        logger.info(f"{quantization_config}")
+        logger.info(f"{model_config.quantization_config}")
     else:
         logger.info("No quantization config found and use original weight and act dtype.")

@@ -53,12 +53,8 @@ class FDRunner:

         req_outputs = self.llm.generate(prompts, sampling_params=sampling_params, **kwargs)
         outputs: list[tuple[list[list[int]], list[str]]] = []
-        sample_output_ids: list[list[int]] = []
-        sample_output_strs: list[str] = []
         for output in req_outputs:
-            sample_output_ids.append(output.outputs.token_ids)
-            sample_output_strs.append(output.outputs.text)
-        outputs.append((sample_output_ids, sample_output_strs))
+            outputs.append((output.outputs.token_ids, output.outputs.text))
         return outputs

     def generate_topp0(
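The shape of the returned value changes with this refactor. An illustrative comparison (token ids and text are made up):

# Before: one aggregated tuple for the whole batch.
outputs_before = [([[1, 2, 3], [4, 5]], ["hello", "world"])]
# After: one (token_ids, text) tuple per request.
outputs_after = [([1, 2, 3], "hello"), ([4, 5], "world")]
assert len(outputs_after) == 2

Note that the outputs type annotation kept as context above still describes the old aggregated shape rather than the per-request tuples.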
@@ -69,7 +65,7 @@ class FDRunner:
     ) -> list[tuple[list[int], str]]:
         from fastdeploy.engine.sampling_params import SamplingParams

-        topp_params = SamplingParams(temperature=0.1, top_p=0, max_tokens=max_tokens)
+        topp_params = SamplingParams(temperature=0.0, top_p=0, max_tokens=max_tokens)
         outputs = self.generate(prompts, topp_params, **kwargs)
         return outputs

tests/model_loader/test_offline_model.py (new file, 104 lines)
@@ -0,0 +1,104 @@
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import sys
+
+import pytest
+
+prompts = ["解释下'温故而知新'", "who are you?"]
+
+current_dir = os.path.dirname(os.path.abspath(__file__))
+project_root = os.path.abspath(os.path.join(current_dir, ".."))
+if project_root not in sys.path:
+    sys.path.insert(0, project_root)
+
+from tests.model_loader.utils import (
+    form_model_get_output_topp0,
+    get_torch_model_path,
+    run_with_timeout,
+)
+
+FD_ENGINE_QUEUE_PORT = int(os.getenv("FD_ENGINE_QUEUE_PORT", 8313))
+FD_CACHE_QUEUE_PORT = int(os.getenv("FD_CACHE_QUEUE_PORT", 8333))
+
+
+model_param_map = {
+    "Qwen3-30B-A3B-FP8": {
+        "tensor_parallel_size": 2,
+        "quantizations": [
+            {
+                "quant_type": "None",
+                "backend": "triton",
+                "env": {"DG_NVCC_OVERRIDE_CPP_STANDARD": "17"},
+            },
+        ],
+    },
+}
+
+params = []
+for model, cfg in model_param_map.items():
+    for q in cfg["quantizations"]:
+        if isinstance(q, dict):
+            quant, backend, env = q["quant_type"], q.get("backend", "default"), q.get("env", {})
+        else:
+            quant, backend, env = q, "default", {}
+        params.append(
+            pytest.param(
+                model,
+                cfg.get("tensor_parallel_size", 1),
+                cfg.get("max_model_len", 1024),
+                quant,
+                cfg.get("max_tokens", 32),
+                env,
+                marks=[pytest.mark.core_model],
+                id=f"offline_quant_{model}.{quant}.{backend}",
+            )
+        )
+
+
+@pytest.mark.parametrize(
+    "model_name_or_path,tensor_parallel_size,max_model_len,quantization,max_tokens,env",
+    params,
+)
+def test_offline_model(
+    fd_runner,
+    model_name_or_path: str,
+    tensor_parallel_size: int,
+    max_model_len: int,
+    max_tokens: int,
+    quantization: str,
+    env,
+    monkeypatch,
+) -> None:
+    torch_model_path = get_torch_model_path(model_name_or_path)
+    if env:
+        for k, v in env.items():
+            monkeypatch.setenv(k, v)
+
+    _ = run_with_timeout(
+        target=form_model_get_output_topp0,
+        args=(
+            fd_runner,
+            torch_model_path,
+            tensor_parallel_size,
+            max_model_len,
+            max_tokens,
+            quantization,
+            "default_v1",
+            FD_ENGINE_QUEUE_PORT,
+            prompts,
+            FD_CACHE_QUEUE_PORT,
+        ),
+    )
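A small, self-contained check of how the parametrisation above names its cases; the expected id is inferred from the f-string in the loop, not taken from a real test run:

model, quant, backend = "Qwen3-30B-A3B-FP8", "None", "triton"
case_id = f"offline_quant_{model}.{quant}.{backend}"
assert case_id == "offline_quant_Qwen3-30B-A3B-FP8.None.triton"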
@@ -181,7 +181,7 @@ def check_tokens_id_and_text_close(
     outputs_1_lst: TokensIdText,
     name_0: str,
     name_1: str,
-    warn_on_mismatch: bool = True,
+    threshold: float = 0.0,
 ) -> None:
     assert len(outputs_0_lst) == len(outputs_1_lst)

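A hedged usage sketch of the new parameter: with the default threshold of 0.0 the helper still demands token-level equality, while a positive threshold tolerates bounded text drift. The variable names below are hypothetical:

check_tokens_id_and_text_close(
    outputs_0_lst=baseline_outputs,       # e.g. an online-quantized run
    outputs_1_lst=offline_fp8_outputs,    # e.g. an offline FP8 checkpoint run
    name_0="online_quant",
    name_1="offline_fp8",
    threshold=0.2,                        # hypothetical tolerance for FP8 numerical drift
)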
@@ -190,10 +190,21 @@ def check_tokens_id_and_text_close(
         output_ids_0, output_str_0 = outputs_0
         output_ids_1, output_str_1 = outputs_1

+        if threshold > 0:
+            diff_rate = calculate_diff_rate(output_str_0, output_str_1)
+            if diff_rate >= threshold:
+                fail_msg = (
+                    f"Test{prompt_idx}:"
+                    f"\n{name_0}:\t{output_str_0!r}"
+                    f"\n{name_1}:\t{output_str_1!r}"
+                    f"\nDiff rate: {diff_rate:.4f} >= threshold: {threshold}"
+                )
+                raise AssertionError(fail_msg)
+        else:
             # Loop through generated tokens.
             for idx, (output_id_0, output_id_1) in enumerate(zip(output_ids_0, output_ids_1)):
                 is_tok_mismatch = output_id_0 != output_id_1
-                if is_tok_mismatch and warn_on_mismatch:
+                if is_tok_mismatch:
                     fail_msg = (
                         f"Test{prompt_idx}:"
                         f"\nMatched tokens:\t{output_ids_0[:idx]}"
@@ -201,10 +212,6 @@ def check_tokens_id_and_text_close(
                         f"\n{name_1}:\t{output_str_1!r}"
                     )
                     raise AssertionError(fail_msg)
-        else:
-            if output_str_0 != output_str_1 and warn_on_mismatch:
-                fail_msg = f"Test{prompt_idx}:" f"\n{name_0}:\t{output_str_0!r}" f"\n{name_1}:\t{output_str_1!r}"
-                raise AssertionError(fail_msg)


 def calculate_diff_rate(text1, text2):