fix bf16 and add comments (#4106)

2025-10-12 20:11:20 +08:00 · 2025-09-15 17:23:07 +08:00
parent 69aa2781a1
commit 113e330030
4 changed files with 8 additions and 4 deletions
--- a/fastdeploy/model_executor/layers/linear.py
+++ b/fastdeploy/model_executor/layers/linear.py
@@ -498,6 +498,7 @@ class MergedColumnParallelLinear(ColumnParallelLinear):
            if weight_need_transpose:
                loaded_weight = get_tensor(loaded_weight)
                loaded_weight = loaded_weight.transpose([1, 0])
+                # Avoid redundant transpose of fused weights when weight_loader is called iteratively
                param.weight_need_transpose = False
            # Loaded weight is already fused on disk.
            shard_offsets = [
@@ -638,6 +639,7 @@ class QKVParallelLinear(ColumnParallelLinear):
            if weight_need_transpose:
                loaded_weight = get_tensor(loaded_weight)
                loaded_weight = loaded_weight.transpose([1, 0])
+                # Avoid redundant transpose of fused weights when weight_loader is called iteratively
                param.weight_need_transpose = False
            # Loaded weight is already fused on disk
            shard_offsets = [
--- a/fastdeploy/model_executor/layers/quantization/__init__.py
+++ b/fastdeploy/model_executor/layers/quantization/__init__.py
@@ -16,6 +16,8 @@ quantization module
 """
 from typing import Dict, List, Type

+from fastdeploy.utils import parse_quantization
+
 from .quant_base import QuantConfigBase

 QUANTIZATION_METHODS: List[str] = [
@@ -35,6 +37,8 @@ QUANTIZATION_METHODS: List[str] = [


 def parse_quant_config(args, model_config, is_ernie, is_v1_loader):
+    if args.quantization is not None and isinstance(args.quantization, str):
+        args.quantization = parse_quantization(args.quantization)
    # 1.model_config.is_quantized
    # TODO(bukejiyu)  model_config.is_quantized is v0 only need to be removed in future
    if model_config.model_format == "torch":