mirror of https://github.com/PaddlePaddle/FastDeploy.git

fix bf16 and add comments (#4106)
@@ -498,6 +498,7 @@ class MergedColumnParallelLinear(ColumnParallelLinear):
         if weight_need_transpose:
             loaded_weight = get_tensor(loaded_weight)
             loaded_weight = loaded_weight.transpose([1, 0])
+            # Avoid redundant transpose of fused weights when weight_loader is called iteratively
             param.weight_need_transpose = False
         # Loaded weight is already fused on disk.
         shard_offsets = [
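
The comment added here records a non-obvious invariant: when a fused weight is already fused on disk, the loader transposes it once and then re-enters itself for each shard slice, so the flag must be cleared before those iterative calls. A minimal standalone sketch of that pattern, assuming a made-up FakeParam and a two-way gate/up split (none of this is FastDeploy code):

class FakeParam:
    weight_need_transpose = True  # checkpoint stores the fused weight transposed

def weight_loader(param, loaded_weight, shard_id=None):
    if param.weight_need_transpose:
        loaded_weight = [list(row) for row in zip(*loaded_weight)]  # one-time transpose
        # Avoid redundant transpose of fused weights when weight_loader is called iteratively
        param.weight_need_transpose = False
    if shard_id is None:
        # Loaded weight is already fused on disk: split it and re-enter per shard.
        half = len(loaded_weight) // 2
        weight_loader(param, loaded_weight[:half], "gate")
        weight_loader(param, loaded_weight[half:], "up")
        return
    print(shard_id, loaded_weight)  # each shard arrives transposed exactly once

weight_loader(FakeParam(), [[1, 2], [3, 4], [5, 6], [7, 8]])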
@@ -638,6 +639,7 @@ class QKVParallelLinear(ColumnParallelLinear):
         if weight_need_transpose:
             loaded_weight = get_tensor(loaded_weight)
             loaded_weight = loaded_weight.transpose([1, 0])
+            # Avoid redundant transpose of fused weights when weight_loader is called iteratively
             param.weight_need_transpose = False
         # Loaded weight is already fused on disk
         shard_offsets = [
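
QKVParallelLinear gets the same guard; its per-shard calls walk q/k/v offsets rather than gate/up slices. A hypothetical illustration of such offsets along the fused output dimension (the head counts and the (shard_id, offset, size) layout are assumptions for the example, not taken from the patch):

num_heads, num_kv_heads, head_dim = 8, 2, 64
q_size = num_heads * head_dim      # 512 rows for q
kv_size = num_kv_heads * head_dim  # 128 rows each for k and v

shard_offsets = [
    ("q", 0, q_size),
    ("k", q_size, kv_size),
    ("v", q_size + kv_size, kv_size),
]
for shard_id, offset, size in shard_offsets:
    print(shard_id, offset, size)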
@@ -16,6 +16,8 @@ quantization module
 """
 from typing import Dict, List, Type
 
+from fastdeploy.utils import parse_quantization
+
 from .quant_base import QuantConfigBase
 
 QUANTIZATION_METHODS: List[str] = [
@@ -35,6 +37,8 @@ QUANTIZATION_METHODS: List[str] = [
 
 
 def parse_quant_config(args, model_config, is_ernie, is_v1_loader):
+    if args.quantization is not None and isinstance(args.quantization, str):
+        args.quantization = parse_quantization(args.quantization)
     # 1.model_config.is_quantized
     # TODO(bukejiyu) model_config.is_quantized is v0 only need to be removed in future
     if model_config.model_format == "torch":
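
With this hunk, parse_quant_config normalizes args.quantization itself, so every caller can hand in None, an already-parsed dict, or a raw CLI string and get consistent behavior. As a hedged stand-in for what the normalization amounts to (the real parse_quantization lives in fastdeploy.utils and may differ in detail):

import json

def parse_quantization_sketch(quantization: str) -> dict:
    # Stand-in only: accept either a JSON config string or a bare method name.
    try:
        return json.loads(quantization)
    except json.JSONDecodeError:
        return {"quantization": quantization}

print(parse_quantization_sketch("wint8"))
print(parse_quantization_sketch('{"quantization": "wint4"}'))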
@@ -240,7 +240,7 @@ def rename_offline_ckpt_suffix_to_fd_suffix(
     }
     moe_quant_type = ""
     dense_quant_type = ""
-    if fd_config.quant_config is None:
+    if fd_config.quant_config is not None:
         if fd_config.quant_config.name() == "mix_quant":
             moe_quant_type = fd_config.quant_config.moe_quant_type
             dense_quant_type = fd_config.quant_config.dense_quant_type
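
This inverted guard is the bf16 fix named in the commit title: with the old check, an unquantized (bf16) checkpoint, whose quant_config is None, entered the branch and crashed on quant_config.name(), while a genuine mix_quant config never populated the suffix types. A minimal standalone repro of the corrected logic (MixQuantConfig is a stand-in class, not the FastDeploy one):

class MixQuantConfig:
    moe_quant_type = "wint4"
    dense_quant_type = "wint8"

    def name(self):
        return "mix_quant"

def quant_types(quant_config):
    moe_quant_type = ""
    dense_quant_type = ""
    if quant_config is not None:  # the old "is None" broke bf16 checkpoints
        if quant_config.name() == "mix_quant":
            moe_quant_type = quant_config.moe_quant_type
            dense_quant_type = quant_config.dense_quant_type
    return moe_quant_type, dense_quant_type

print(quant_types(None))              # bf16 checkpoint: ('', '')
print(quant_types(MixQuantConfig()))  # mix_quant checkpoint: ('wint4', 'wint8')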
@@ -44,7 +44,7 @@ from fastdeploy.inter_communicator import EngineWorkerQueue as TaskQueue
 from fastdeploy.inter_communicator import IPCSignal
 from fastdeploy.model_executor.layers.quantization import parse_quant_config
 from fastdeploy.platforms import current_platform
-from fastdeploy.utils import get_logger, parse_quantization
+from fastdeploy.utils import get_logger
 from fastdeploy.worker.worker_base import WorkerBase
 
 logger = get_logger("worker_process", "worker_process.log")
@@ -655,8 +655,6 @@ def initialize_fd_config(args, ranks: int = 1, local_rank: int = 0) -> FDConfig:
         FDConfig: Initialized FastDeploy configuration object
     """
     # RL rollout
-    if args.quantization is not None and isinstance(args.quantization, str):
-        args.quantization = parse_quantization(args.quantization)
     paddle.set_default_dtype(args.dtype)
     model_config = ModelConfig(vars(args))
     device_config = DeviceConfig(vars(args))
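
The last two hunks are the cleanup half of the refactor: since parse_quant_config now normalizes args.quantization (see the @@ -35,6 +37,8 @@ hunk above), initialize_fd_config drops its own copy of that logic and worker_process no longer imports fastdeploy.utils.parse_quantization. A compressed sketch of the consolidated flow, with stand-in function bodies:

def parse_quantization_stub(s):
    # stand-in for fastdeploy.utils.parse_quantization
    return {"quantization": s}

def parse_quant_config_sketch(args):
    # normalization now happens once, here
    if args.quantization is not None and isinstance(args.quantization, str):
        args.quantization = parse_quantization_stub(args.quantization)

def initialize_fd_config_sketch(args):
    # no string handling left here; args.quantization is already a dict (or None)
    return {"quant": args.quantization}

class Args:
    quantization = "wint8"

args = Args()
parse_quant_config_sketch(args)
print(initialize_fd_config_sketch(args))  # {'quant': {'quantization': 'wint8'}}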