diff --git a/fastdeploy/model_executor/layers/linear.py b/fastdeploy/model_executor/layers/linear.py index 2c8e9fe33..d17ab1be3 100644 --- a/fastdeploy/model_executor/layers/linear.py +++ b/fastdeploy/model_executor/layers/linear.py @@ -498,6 +498,7 @@ class MergedColumnParallelLinear(ColumnParallelLinear): if weight_need_transpose: loaded_weight = get_tensor(loaded_weight) loaded_weight = loaded_weight.transpose([1, 0]) + # Avoid redundant transpose of fused weights when weight_loader is called iteratively param.weight_need_transpose = False # Loaded weight is already fused on disk. shard_offsets = [ @@ -638,6 +639,7 @@ class QKVParallelLinear(ColumnParallelLinear): if weight_need_transpose: loaded_weight = get_tensor(loaded_weight) loaded_weight = loaded_weight.transpose([1, 0]) + # Avoid redundant transpose of fused weights when weight_loader is called iteratively param.weight_need_transpose = False # Loaded weight is already fused on disk shard_offsets = [ diff --git a/fastdeploy/model_executor/layers/quantization/__init__.py b/fastdeploy/model_executor/layers/quantization/__init__.py index 58f610ebf..6be178282 100644 --- a/fastdeploy/model_executor/layers/quantization/__init__.py +++ b/fastdeploy/model_executor/layers/quantization/__init__.py @@ -16,6 +16,8 @@ quantization module """ from typing import Dict, List, Type +from fastdeploy.utils import parse_quantization + from .quant_base import QuantConfigBase QUANTIZATION_METHODS: List[str] = [ @@ -35,6 +37,8 @@ QUANTIZATION_METHODS: List[str] = [ def parse_quant_config(args, model_config, is_ernie, is_v1_loader): + if args.quantization is not None and isinstance(args.quantization, str): + args.quantization = parse_quantization(args.quantization) # 1.model_config.is_quantized # TODO(bukejiyu) model_config.is_quantized is v0 only need to be removed in future if model_config.model_format == "torch": diff --git a/fastdeploy/model_executor/utils.py b/fastdeploy/model_executor/utils.py index be66a5f27..0b226bf7b 100644 --- a/fastdeploy/model_executor/utils.py +++ b/fastdeploy/model_executor/utils.py @@ -240,7 +240,7 @@ def rename_offline_ckpt_suffix_to_fd_suffix( } moe_quant_type = "" dense_quant_type = "" - if fd_config.quant_config is None: + if fd_config.quant_config is not None: if fd_config.quant_config.name() == "mix_quant": moe_quant_type = fd_config.quant_config.moe_quant_type dense_quant_type = fd_config.quant_config.dense_quant_type diff --git a/fastdeploy/worker/worker_process.py b/fastdeploy/worker/worker_process.py index 4ef043f12..3cf6fe928 100644 --- a/fastdeploy/worker/worker_process.py +++ b/fastdeploy/worker/worker_process.py @@ -44,7 +44,7 @@ from fastdeploy.inter_communicator import EngineWorkerQueue as TaskQueue from fastdeploy.inter_communicator import IPCSignal from fastdeploy.model_executor.layers.quantization import parse_quant_config from fastdeploy.platforms import current_platform -from fastdeploy.utils import get_logger, parse_quantization +from fastdeploy.utils import get_logger from fastdeploy.worker.worker_base import WorkerBase logger = get_logger("worker_process", "worker_process.log") @@ -655,8 +655,6 @@ def initialize_fd_config(args, ranks: int = 1, local_rank: int = 0) -> FDConfig: FDConfig: Initialized FastDeploy configuration object """ # RL rollout - if args.quantization is not None and isinstance(args.quantization, str): - args.quantization = parse_quantization(args.quantization) paddle.set_default_dtype(args.dtype) model_config = ModelConfig(vars(args)) device_config = DeviceConfig(vars(args))