[v1 loader] Qwen offline fp8 (#4036)

* support offline fp8

* update ut

* update ut

* update ut

* fix

* update

* update
bukejiyu
2025-09-15 13:44:11 +08:00
committed by GitHub
parent b1a5b756a3
commit 29ed617f0f
21 changed files with 440 additions and 138 deletions
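
The diff below routes quantization-config parsing through a new parse_quant_config helper so that checkpoints quantized offline (for example, an fp8 Qwen checkpoint) are recognized by the v1 loader instead of being re-quantized at load time. As a hypothetical illustration only — the key names come from the diff below, but the concrete quant name is an assumption, not taken from this PR — such a checkpoint's config.json might carry something like:

# Hypothetical example: a quantization_config as an offline fp8 Qwen checkpoint
# might declare it in config.json. "wfp8afp8" is an assumed quant name;
# "is_quantized" and "kv_cache_quant_type" are the keys the loader inspects below.
offline_fp8_quantization_config = {
    "quantization": "wfp8afp8",  # assumed name of the weight/activation fp8 method
    "is_quantized": True,        # weights were already quantized offline; load as-is
}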


@@ -42,7 +42,7 @@ from fastdeploy.config import (
 from fastdeploy.input.ernie4_5_tokenizer import Ernie4_5Tokenizer
 from fastdeploy.inter_communicator import EngineWorkerQueue as TaskQueue
 from fastdeploy.inter_communicator import IPCSignal
-from fastdeploy.model_executor.layers.quantization import get_quantization_config
+from fastdeploy.model_executor.layers.quantization import parse_quant_config
 from fastdeploy.platforms import current_platform
 from fastdeploy.utils import get_logger, parse_quantization
 from fastdeploy.worker.worker_base import WorkerBase
@@ -698,50 +698,12 @@ def initialize_fd_config(args, ranks: int = 1, local_rank: int = 0) -> FDConfig:
     if getattr(model_config, "num_hidden_layers", None) is None:
         raise ValueError("num_hidden_layers is None")
     quantization_config = model_config.quantization_config
-    if not model_config.is_quantized:
-        if quantization_config is not None:
-            if "is_quantized" in quantization_config:
-                model_config.is_quantized = quantization_config["is_quantized"]
-            elif "kv_cache_quant_type" not in quantization_config:
-                model_config.is_quantized = True
-    quant_config_name = None
-    if quantization_config is not None and quantization_config.get("quantization", None) is None:
-        raise ValueError("quantization_config should have a key named 'quantization' for specify quant config.")
-    if quantization_config is not None:
-        quant_config_name = quantization_config["quantization"]
-        # TODO(YuanRisheng) is_checkpoint_bf16 may need to be removed and replaced by is_quantized in future
-        if "kv_cache_quant_type" in quantization_config and load_config.load_choices == "default_v1":
-            quantization_config["is_checkpoint_bf16"] = True
-    elif args.quantization is not None:
-        quantization_config = {}
-        try:
-            quantization_config.update(args.quantization)
-            quant_config_name = quantization_config["quantization"]
-        except:
-            quant_config_name = args.quantization["quantization"]
-            quantization_config["quantization"] = quant_config_name
-        # Only v1 loader sets is_checkpoint_bf16=True during dynamic quantization.
-        if load_config.load_choices == "default_v1":
-            quantization_config["is_checkpoint_bf16"] = True
-        # Special handling for Ernie models
-        is_ernie = ErnieArchitectures.contains_ernie_arch(model_config.architectures)
-        if quant_config_name == "wint4" and is_ernie:
-            quantization_config["dense_quant_type"] = "wint8"
-            quantization_config["moe_quant_type"] = "wint4"
-            quantization_config["quantization"] = "mix_quant"
-            quant_config_name = "mix_quant"
-    else:
-        quant_config_name = None
-    if quant_config_name is None:
-        quant_config = None
-    else:
-        quant_cls = get_quantization_config(quant_config_name)
-        quant_config = quant_cls.from_config(quantization_config)
+    quant_config = parse_quant_config(
+        args,
+        model_config,
+        is_ernie=ErnieArchitectures.contains_ernie_arch(model_config.architectures),
+        is_v1_loader=load_config.load_choices == "default_v1",
+    )
     # Log quantization info
     logger.info("===========quantization_config==============")
@@ -751,7 +713,7 @@ def initialize_fd_config(args, ranks: int = 1, local_rank: int = 0) -> FDConfig:
         else:
             logger.info("Model Status: Original (will apply online quantization)")
-        logger.info(f"{quantization_config}")
+        logger.info(f"{model_config.quantization_config}")
     else:
         logger.info("No quantization config found and use original weight and act dtype.")