[v1 loader] Qwen offline FP8 (#4036)
* support offline fp8
* update ut
* update ut
* update ut
* fix
* update
* update
@@ -42,7 +42,7 @@ from fastdeploy.config import (
 from fastdeploy.input.ernie4_5_tokenizer import Ernie4_5Tokenizer
 from fastdeploy.inter_communicator import EngineWorkerQueue as TaskQueue
 from fastdeploy.inter_communicator import IPCSignal
-from fastdeploy.model_executor.layers.quantization import get_quantization_config
+from fastdeploy.model_executor.layers.quantization import parse_quant_config
 from fastdeploy.platforms import current_platform
 from fastdeploy.utils import get_logger, parse_quantization
 from fastdeploy.worker.worker_base import WorkerBase
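The import swap above is the visible edge of the refactor: call sites no longer look up a quant class via get_quantization_config and build it themselves, but hand everything to parse_quant_config. For orientation, a hypothetical quantization_config dict of the kind this path consumes is sketched below; the key names all appear in this diff, while the values are purely illustrative.

# Hypothetical quantization_config as it might appear in a checkpoint's
# config.json. Key names are taken from this diff; values are illustrative.
quantization_config = {
    "quantization": "wint8",        # required: names the quant method
    "kv_cache_quant_type": "int8",  # optional; this value is an assumption
    "is_quantized": True,           # optional: weights already quantized offline
}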
@@ -698,50 +698,12 @@ def initialize_fd_config(args, ranks: int = 1, local_rank: int = 0) -> FDConfig:
     if getattr(model_config, "num_hidden_layers", None) is None:
         raise ValueError("num_hidden_layers is None")
 
-    quantization_config = model_config.quantization_config
-    if not model_config.is_quantized:
-        if quantization_config is not None:
-            if "is_quantized" in quantization_config:
-                model_config.is_quantized = quantization_config["is_quantized"]
-            elif "kv_cache_quant_type" not in quantization_config:
-                model_config.is_quantized = True
-
-    quant_config_name = None
-    if quantization_config is not None and quantization_config.get("quantization", None) is None:
-        raise ValueError("quantization_config should have a key named 'quantization' for specify quant config.")
-
-    if quantization_config is not None:
-        quant_config_name = quantization_config["quantization"]
-        # TODO(YuanRisheng) is_checkpoint_bf16 may need to be removed and replaced by is_quantized in future
-        if "kv_cache_quant_type" in quantization_config and load_config.load_choices == "default_v1":
-            quantization_config["is_checkpoint_bf16"] = True
-
-    elif args.quantization is not None:
-        quantization_config = {}
-        try:
-            quantization_config.update(args.quantization)
-            quant_config_name = quantization_config["quantization"]
-        except:
-            quant_config_name = args.quantization["quantization"]
-            quantization_config["quantization"] = quant_config_name
-        # Only v1 loader sets is_checkpoint_bf16=True during dynamic quantization.
-        if load_config.load_choices == "default_v1":
-            quantization_config["is_checkpoint_bf16"] = True
-        # Special handling for Ernie models
-        is_ernie = ErnieArchitectures.contains_ernie_arch(model_config.architectures)
-        if quant_config_name == "wint4" and is_ernie:
-            quantization_config["dense_quant_type"] = "wint8"
-            quantization_config["moe_quant_type"] = "wint4"
-            quantization_config["quantization"] = "mix_quant"
-            quant_config_name = "mix_quant"
-    else:
-        quant_config_name = None
-
-    if quant_config_name is None:
-        quant_config = None
-    else:
-        quant_cls = get_quantization_config(quant_config_name)
-        quant_config = quant_cls.from_config(quantization_config)
+    quant_config = parse_quant_config(
+        args,
+        model_config,
+        is_ernie=ErnieArchitectures.contains_ernie_arch(model_config.architectures),
+        is_v1_loader=load_config.load_choices == "default_v1",
+    )
 
     # Log quantization info
    logger.info("===========quantization_config==============")
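The replacement call is a one-liner, but it has to preserve everything deleted above. Below is a minimal sketch of what parse_quant_config could look like if it simply consolidates that branch under the new signature; the name and parameters come from the diff, the body is an assumption, and the real implementation in fastdeploy.model_executor.layers.quantization may differ.

from fastdeploy.model_executor.layers.quantization import get_quantization_config


def parse_quant_config(args, model_config, is_ernie: bool, is_v1_loader: bool):
    """Sketch: consolidate the quant-config branch removed in this commit."""
    quantization_config = model_config.quantization_config

    # Infer is_quantized for checkpoints that do not state it explicitly.
    if not model_config.is_quantized and quantization_config is not None:
        if "is_quantized" in quantization_config:
            model_config.is_quantized = quantization_config["is_quantized"]
        elif "kv_cache_quant_type" not in quantization_config:
            model_config.is_quantized = True

    quant_config_name = None
    if quantization_config is not None:
        if quantization_config.get("quantization") is None:
            raise ValueError(
                "quantization_config should have a key named 'quantization' to specify the quant config."
            )
        quant_config_name = quantization_config["quantization"]
        # v1 loader: a bf16 checkpoint with a quantized KV cache is loaded as
        # bf16 and quantized online.
        if "kv_cache_quant_type" in quantization_config and is_v1_loader:
            quantization_config["is_checkpoint_bf16"] = True
    elif args.quantization is not None:
        # args.quantization may arrive as a dict or a bare method name.
        if isinstance(args.quantization, dict):
            quantization_config = dict(args.quantization)
        else:
            quantization_config = {"quantization": args.quantization}
        quant_config_name = quantization_config["quantization"]
        # Only the v1 loader marks bf16 checkpoints for dynamic quantization.
        if is_v1_loader:
            quantization_config["is_checkpoint_bf16"] = True
        # Ernie models split quant types between dense and MoE layers under wint4.
        if quant_config_name == "wint4" and is_ernie:
            quantization_config["dense_quant_type"] = "wint8"
            quantization_config["moe_quant_type"] = "wint4"
            quantization_config["quantization"] = "mix_quant"
            quant_config_name = "mix_quant"

    if quant_config_name is None:
        return None
    quant_cls = get_quantization_config(quant_config_name)
    return quant_cls.from_config(quantization_config)

Under this reading, initialize_fd_config keeps a single decision point: the helper returns None when no quantization is requested, and a fully constructed quant config object otherwise.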
@@ -751,7 +713,7 @@ def initialize_fd_config(args, ranks: int = 1, local_rank: int = 0) -> FDConfig:
         else:
             logger.info("Model Status: Original (will apply online quantization)")
 
-        logger.info(f"{quantization_config}")
+        logger.info(f"{model_config.quantization_config}")
     else:
         logger.info("No quantization config found and use original weight and act dtype.")
 
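The one-line fix above presumably matters because the local quantization_config is no longer the dict the helper mutates; model_config.quantization_config is what the model will actually use. Assembled only from the log strings visible in this diff (the final dict line is illustrative), an online-quantization run would now print roughly:

===========quantization_config==============
Model Status: Original (will apply online quantization)
{'quantization': 'wint8', 'is_checkpoint_bf16': True}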