Mirror of https://github.com/PaddlePaddle/FastDeploy.git (synced 2025-10-05 16:48:03 +08:00)
Unify server-side and model-side Config (Part3) (#3047)
* merge model config
* fix arch
* fix rl
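With this part of the unification, the legacy engine-side ModelConfig class is removed (see the `@@ -19,141 +19,10` hunk below) and everything goes through the unified `fastdeploy.config.ModelConfig`, which is now built from a plain dict of engine arguments: keys that match existing attributes override the defaults, and the remaining model details are read from the checkpoint's config via `PretrainedConfig`. A minimal construction sketch, under the assumption of a placeholder checkpoint path (not part of this patch):

    from fastdeploy.config import ModelConfig

    # Hypothetical checkpoint directory; its config.json supplies architectures, head_dim, etc.
    args = {
        "model": "/path/to/checkpoint",   # replaces the old model_name_or_path field
        "dtype": "bfloat16",
        "quantization": None,
    }
    model_cfg = ModelConfig(args)         # defaults overridden by matching keys
    print(model_cfg.architectures)        # filled in from the pretrained config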
@@ -73,7 +73,13 @@ class ErnieArchitectures:


 PRETRAINED_INIT_CONFIGURATION = {
+    "top_p": 1.0,
+    "temperature": 1.0,
     "rope_theta": 10000.0,
+    "penalty_score": 1.0,
+    "frequency_score": 0.0,
+    "presence_score": 0.0,
+    "min_length": 1,
     "num_key_value_heads": -1,
     "start_layer_index": 0,
     "moe_num_shared_experts": 0,
@@ -101,16 +107,7 @@ class ModelConfig:
         self,
         args,
     ):
-        # NOTE(gongshaotain): form _load_model_init_val()
-        self.top_p = 1.0
-        self.temperature = 1.0
-        self.rope_theta = 10000.0
-        self.penalty_score = 1.0
-        self.frequency_score = 0.0
-        self.presence_score = 0.0
-        self.min_length = 1
-        self.model_name_or_path = ""
-
+        self.model = ""
         self.is_quantized = False
         self.max_model_len = 0
         self.dtype = ""
@@ -118,16 +115,13 @@ class ModelConfig:
         self.enable_mm = False
         self.enable_redundant_experts = False
         self.redundant_experts_num = 0
-        self.max_stop_seqs_num = int(envs.FD_MAX_STOP_SEQS_NUM)
-        self.stop_seqs_max_len = int(envs.FD_STOP_SEQS_MAX_LEN)
+        self.quantization = None

         for key, value in args.items():
             if hasattr(self, key):
                 setattr(self, key, value)

-        assert self.model_name_or_path != ""
-        pretrained_config, _ = PretrainedConfig.get_config_dict(self.model_name_or_path)
+        assert self.model != ""
+        pretrained_config, _ = PretrainedConfig.get_config_dict(self.model)
         self.pretrained_config = PretrainedConfig.from_dict(pretrained_config)

         # set attribute from pretrained_config
@@ -149,6 +143,64 @@ class ModelConfig:
         if ErnieArchitectures.contains_ernie_arch(self.architectures):
             self.ori_vocab_size = args.get("ori_vocab_size", self.ori_vocab_size)

+        self.is_unified_ckpt = check_unified_ckpt(self.model)
+
+        self.override_name_from_config()
+        self.read_from_env()
+
+    def override_name_from_config(self):
+        """
+        Override attribute names from the exported model's configuration.
+        """
+
+        if not self.is_unified_ckpt and hasattr(self, "infer_model_mp_num"):
+            self.tensor_parallel_size = self.infer_model_mp_num
+            del self.infer_model_mp_num
+
+        if hasattr(self, "num_hidden_layers"):
+            if hasattr(self, "remove_tail_layer"):
+                if self.remove_tail_layer is True:
+                    self.num_hidden_layers -= 1
+                elif isinstance(self.remove_tail_layer, int):
+                    self.num_hidden_layers -= self.remove_tail_layer
+
+        if not hasattr(self, "mla_use_absorb"):
+            self.mla_use_absorb = False
+
+    def read_from_env(self):
+        """
+        Read configuration information from environment variables and update the object's attributes.
+
+        If an attribute is not present or is an empty string in the environment variables, use the default value.
+        """
+        self.max_stop_seqs_num = int(envs.FD_MAX_STOP_SEQS_NUM)
+        self.stop_seqs_max_len = int(envs.FD_STOP_SEQS_MAX_LEN)
+
+        def reset_config_value(key, value):
+            if not hasattr(self, key.lower()):
+                if os.getenv(key, None):
+                    value = eval(os.getenv(key))
+                    logger.info(f"Get parameter `{key}` = {value} from environment.")
+                else:
+                    logger.info(f"Parameter `{key}` will use default value {value}.")
+                setattr(self, key.lower(), value)
+
+        reset_config_value("COMPRESSION_RATIO", 1.0)
+        reset_config_value("ROPE_THETA", 10000)
+
+    def _get_download_model(self, model_name, model_type="default"):
+        # TODO: Provide dynamic graph for self-downloading and save to the specified download directory.
+        pass
+
+    def print(self):
+        """
+        Print all configuration information.
+        """
+        logger.info("Model Configuration Information :")
+        for k, v in self.__dict__.items():
+            logger.info("{:<20}:{:<6}{}".format(k, "", v))
+        logger.info("=============================================================")
+

 class ParallelConfig:
     """Configuration for the distributed execution."""
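`read_from_env` above only fills attributes that are not already set: `COMPRESSION_RATIO` and `ROPE_THETA` map to the lowercase attributes `compression_ratio` and `rope_theta`, with the given defaults used when the variable is absent. A rough sketch of the behaviour, assuming a placeholder checkpoint whose config does not already define the attribute:

    import os
    from fastdeploy.config import ModelConfig

    os.environ["COMPRESSION_RATIO"] = "0.8"        # picked up and eval'd by reset_config_value

    cfg = ModelConfig({"model": "/path/to/checkpoint"})   # hypothetical path
    print(cfg.compression_ratio)                   # 0.8, read from the environment
    print(cfg.max_stop_seqs_num)                   # taken from envs.FD_MAX_STOP_SEQS_NUM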
@@ -173,7 +225,6 @@ class ParallelConfig:
         From old wersion worker args
         TODO(gongshaotian): Reclassify
         """
-        self.model_name_or_path: str = "./output"
         self.max_num_seqs: int = 34
         # Set default block num for profile run
         self.total_block_num: int = 2000
@@ -609,7 +660,7 @@ class CacheConfig:
             self.enable_hierarchical_cache = True

         if self.model_cfg is not None:
-            if hasattr(self.model_cfg, "quantization_config"):
+            if self.model_cfg.quantization_config is not None:
                 self.cache_dtype = self.model_cfg.quantization_config.get("kv_cache_quant_type", self.cache_dtype)
             if (
                 hasattr(self.model_cfg, "num_key_value_heads")
@@ -631,7 +682,7 @@ class CacheConfig:
             else:
                 byte_size = 2
             self.each_token_cache_space = int(
-                self.model_cfg.num_layers * kv_num_head * self.model_cfg.head_dim * byte_size
+                self.model_cfg.num_hidden_layers * kv_num_head * self.model_cfg.head_dim * byte_size
             )
             self.bytes_per_block = int(self.each_token_cache_space * self.block_size)
             self.bytes_per_layer_per_block = int(
@@ -22,6 +22,7 @@ from typing import Any, Dict, List, Optional
 from fastdeploy.config import (
     CacheConfig,
     GraphOptimizationConfig,
+    LoadConfig,
     SpeculativeConfig,
     TaskOption,
 )
@@ -756,18 +757,6 @@ class EngineArgs:
         """
         return cls(**{field.name: getattr(args, field.name) for field in dataclass_fields(cls)})

-    def create_model_config(self) -> ModelConfig:
-        """
-        Create and return a ModelConfig object based on the current settings.
-        """
-        return ModelConfig(
-            model_name_or_path=self.model,
-            config_json_file=self.model_config_name,
-            quantization=self.quantization,
-            dynamic_load_weight=self.dynamic_load_weight,
-            load_strategy=self.load_strategy,
-        )
-
     def create_speculative_config(self) -> SpeculativeConfig:
         """ """
         speculative_args = asdict(self)
@@ -826,7 +815,12 @@ class EngineArgs:
         """
         Create and return a Config object based on the current settings.
         """
-        model_cfg = self.create_model_config()
+        all_dict = asdict(self)
+        model_cfg = ModelConfig(all_dict)
+        all_dict["model_cfg"] = model_cfg
+        cache_cfg = CacheConfig(all_dict)
+        load_cfg = LoadConfig(all_dict)
+
         if not model_cfg.is_unified_ckpt and hasattr(model_cfg, "tensor_parallel_size"):
             self.tensor_parallel_size = model_cfg.tensor_parallel_size
         if self.max_num_batched_tokens is None:
@@ -843,16 +837,13 @@ class EngineArgs:
             self.tensor_parallel_size <= 1 and self.enable_custom_all_reduce
         ), "enable_custom_all_reduce must be used with tensor_parallel_size>1"

-        all_dict = asdict(self)
-        all_dict["model_cfg"] = model_cfg
-        cache_cfg = CacheConfig(all_dict)
-
         return Config(
             model_name_or_path=self.model,
             model_config=model_cfg,
             scheduler_config=scheduler_cfg,
             tokenizer=self.tokenizer,
             cache_config=cache_cfg,
+            load_config=load_cfg,
             parallel_config=self.create_parallel_config(),
             max_model_len=self.max_model_len,
             tensor_parallel_size=self.tensor_parallel_size,
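Downstream call sites follow the same split: weight-loading options move from `model_config` to the new `load_config`, and the model path is read from `model_config.model` rather than `parallel_config.model_name_or_path` (see the `LLMEngine` and worker hunks below). A small access sketch, assuming `cfg` is the `Config` returned by `create_engine_config`:

    # Hypothetical accesses after this change; cfg is an assembled Config object.
    strategy  = cfg.load_config.load_strategy          # was cfg.model_config.load_strategy
    dynamic   = cfg.load_config.dynamic_load_weight    # was cfg.model_config.dynamic_load_weight
    model_dir = cfg.model_config.model                 # was parallel_config.model_name_or_path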
@@ -19,141 +19,10 @@ from dataclasses import dataclass
 from datetime import datetime
 from typing import Any, Dict, List, Optional

-from fastdeploy import envs
-from fastdeploy.config import CacheConfig
+from fastdeploy.config import CacheConfig, LoadConfig, ModelConfig
 from fastdeploy.platforms import current_platform
 from fastdeploy.scheduler import SchedulerConfig
-from fastdeploy.utils import (
-    ceil_div,
-    check_unified_ckpt,
-    get_host_ip,
-    is_port_available,
-    llm_logger,
-)
+from fastdeploy.utils import ceil_div, get_host_ip, is_port_available, llm_logger


-class ModelConfig:
-    """
-    Configuration class for the model.
-
-    Attributes:
-        model_dir (str): Directory path to the model.
-        is_unified_ckpt (bool): Flag indicating if the checkpoint is unified.
-        model_name_or_path (str): Name or path of the model.
-    """
-
-    def __init__(
-        self,
-        model_name_or_path: str,
-        config_json_file: str = "config.json",
-        dynamic_load_weight: bool = False,
-        load_strategy: str = "ipc_snapshot",
-        quantization: str = None,
-        download_dir: Optional[str] = None,
-    ):
-        """
-        Initialize the ModelConfig class.
-
-        Args:
-            model_name_or_path (str): Name or path of the model.
-            config_json_file (str): Path to the configuration JSON file. Default is 'config.json'.
-            download_dir (Optional[str]): Directory to download model files. Default is None.
-        """
-        self.model_dir = model_name_or_path
-        self.is_unified_ckpt = check_unified_ckpt(self.model_dir)
-        self.dynamic_load_weight = dynamic_load_weight
-        self.load_strategy = load_strategy
-        self.quantization = quantization
-
-        config_file = os.path.join(model_name_or_path, config_json_file)
-        if os.path.isfile(model_name_or_path):
-            try:
-                from paddleformers.transformers import AutoConfig
-
-                config = AutoConfig.from_pretrained(model_name_or_path)
-                config_dict = {k: v for k, v in vars(config).items() if not k.startswith("_")}
-                for key, value in config_dict.items():
-                    setattr(self, key, value)
-            except Exception:
-                llm_logger.error(
-                    "Don't support the current model, you can use `paddleformers` to register your model."
-                )
-                raise ValueError(
-                    "Don't support the current model, you can use `paddleformers` to register your model."
-                )
-        else:
-            with open(config_file, "r", encoding="utf-8") as f:
-                config_dict = json.load(f)
-            for key, value in config_dict.items():
-                try:
-                    setattr(self, key, value)
-                except Exception:
-                    continue
-
-        if isinstance(self.architectures, list):
-            self.architectures = self.architectures[0]
-        self.model_name_or_path = model_name_or_path
-        self.override_name_from_config()
-        self.read_from_env()
-
-    def override_name_from_config(self):
-        """
-        Override attribute names from the exported model's configuration.
-        """
-
-        if not self.is_unified_ckpt and hasattr(self, "infer_model_mp_num"):
-            self.tensor_parallel_size = self.infer_model_mp_num
-            del self.infer_model_mp_num
-
-        if hasattr(self, "num_hidden_layers"):
-            if hasattr(self, "remove_tail_layer"):
-                if self.remove_tail_layer is True:
-                    self.num_hidden_layers -= 1
-                elif isinstance(self.remove_tail_layer, int):
-                    self.num_hidden_layers -= self.remove_tail_layer
-
-            self.num_layers = self.num_hidden_layers
-            del self.num_hidden_layers
-
-        if not hasattr(self, "mla_use_absorb"):
-            self.mla_use_absorb = False
-        if not hasattr(self, "head_dim"):
-            assert hasattr(self, "hidden_size") and hasattr(self, "num_attention_heads")
-            self.head_dim = self.hidden_size // self.num_attention_heads
-
-    def read_from_env(self):
-        """
-        Read configuration information from environment variables and update the object's attributes.
-
-        If an attribute is not present or is an empty string in the environment variables, use the default value.
-        """
-        self.max_stop_seqs_num = int(envs.FD_MAX_STOP_SEQS_NUM)
-        self.stop_seqs_max_len = int(envs.FD_STOP_SEQS_MAX_LEN)
-
-        def reset_config_value(key, value):
-            if not hasattr(self, key.lower()):
-                if os.getenv(key, None):
-                    value = eval(os.getenv(key))
-                    llm_logger.info(f"Get parameter `{key}` = {value} from environment.")
-                else:
-                    llm_logger.info(f"Parameter `{key}` will use default value {value}.")
-                setattr(self, key.lower(), value)
-
-        reset_config_value("COMPRESSION_RATIO", 1.0)
-        reset_config_value("ROPE_THETA", 10000)
-
-    def _get_download_model(self, model_name, model_type="default"):
-        # TODO: Provide dynamic graph for self-downloading and save to the specified download directory.
-        pass
-
-    def print(self):
-        """
-        Print all configuration information.
-        """
-        llm_logger.info("Model Configuration Information :")
-        for k, v in self.__dict__.items():
-            llm_logger.info("{:<20}:{:<6}{}".format(k, "", v))
-        llm_logger.info("=============================================================")
-
-
 class ParallelConfig:
@@ -288,6 +157,7 @@ class Config:
         cache_config: CacheConfig,
         scheduler_config: SchedulerConfig,
         parallel_config: ParallelConfig,
+        load_config: LoadConfig,
         commit_config: CommitConfig = CommitConfig(),
         model_name_or_path: str = None,
         tokenizer: str = None,
@@ -345,6 +215,7 @@ class Config:
         self.cache_config = cache_config
         self.scheduler_config = scheduler_config
         self.parallel_config = parallel_config
+        self.load_config = load_config
         self.commit_config = commit_config
         self.model_name_or_path = model_name_or_path
         self.tokenizer = tokenizer
@@ -1064,7 +1064,7 @@ class LLMEngine:
             f" --devices {self.cfg.device_ids} {py_script}"
             f" --max_num_seqs {self.cfg.max_num_seqs} --max_model_len {self.cfg.max_model_len}"
             f" --gpu_memory_utilization {self.cfg.cache_config.gpu_memory_utilization}"
-            f" --model_name_or_path {self.cfg.model_name_or_path!s}"
+            f" --model {self.cfg.model_name_or_path!s}"
             f" --device_ids {self.cfg.device_ids}"
             f" --tensor_parallel_size {self.cfg.tensor_parallel_size}"
             f" --engine_worker_queue_port {self.cfg.engine_worker_queue_port!s}"
@@ -1084,7 +1084,7 @@ class LLMEngine:
             f" --speculative_config '{self.cfg.speculative_config.to_json_string()}'"
             f" --graph_optimization_config '{self.cfg.graph_optimization_config.to_json_string()}'"
             f" --guided_decoding_backend {self.cfg.guided_decoding_backend}"
-            f" --load_strategy {self.cfg.model_config.load_strategy}"
+            f" --load_strategy {self.cfg.load_config.load_strategy}"
         )

         worker_append_flag = {
@@ -1092,7 +1092,7 @@ class LLMEngine:
             "enable_prefix_caching": self.cfg.cache_config.enable_prefix_caching,
             "enable_chunked_prefill": self.cfg.cache_config.enable_chunked_prefill,
             "do_profile": self.do_profile,
-            "dynamic_load_weight": self.cfg.model_config.dynamic_load_weight,
+            "dynamic_load_weight": self.cfg.load_config.dynamic_load_weight,
             "disable_any_whitespace": self.cfg.disable_any_whitespace,
             "enable_custom_all_reduce": self.cfg.parallel_config.enable_custom_all_reduce,
             "enable_logprob": self.cfg.enable_logprob,
@@ -1231,9 +1231,9 @@ class LLMEngine:
                 elif (match := re.search(r"Start load layer (\d+)", line)) or (
                     match := re.search(r"set state for layer (\d+)", line)
                 ):
-                    progress = eval(match.group(1)) * 1.0 / self.cfg.model_config.num_layers
+                    progress = eval(match.group(1)) * 1.0 / self.cfg.model_config.num_hidden_layers
                     self.worker_init_status["layer_loadding"] = progress
-                    if self.worker_init_status["layer_loadding"] == self.cfg.model_config.num_layers - 1:
+                    if self.worker_init_status["layer_loadding"] == self.cfg.model_config.num_hidden_layers - 1:
                         self.worker_init_status["finished"] = True

         self.checking_worker_status_thread = threading.Thread(target=detect_thread, daemon=True)
@@ -70,7 +70,7 @@ class InputPreprocessor:
         reasoning_parser_obj = None
         if self.reasoning_parser:
             reasoning_parser_obj = ReasoningParserManager.get_reasoning_parser(self.reasoning_parser)
-        architectures = ModelConfig(self.model_name_or_path).architectures
+        architectures = ModelConfig({"model": self.model_name_or_path}).architectures[0]
         if not self.enable_mm:
             if not ErnieArchitectures.contains_ernie_arch(architectures):
                 from fastdeploy.input.text_processor import DataProcessor
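Throwaway configs built from a bare path take the same dict form now, and `architectures` stays a list in the unified class (hence the `[0]`). A hedged usage sketch with a placeholder checkpoint path:

    from fastdeploy.config import ModelConfig

    # Hypothetical checkpoint directory; architectures is read from its pretrained config.
    arch = ModelConfig({"model": "/path/to/checkpoint"}).architectures[0]
    print(arch)   # single architecture name, e.g. as checked by ErnieArchitectures.contains_ernie_arch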
@@ -270,7 +270,7 @@ class BackendBase:
                 from transformers import AutoTokenizer, PreTrainedTokenizerFast

                 tokenizer = AutoTokenizer.from_pretrained(
-                    self.fd_config.parallel_config.model_name_or_path,
+                    self.fd_config.model_config.model,
                     use_fast=False,
                 )

@@ -289,14 +289,14 @@ class BackendBase:
                 for i in range(len(vocab_file_names)):
                     if os.path.exists(
                         os.path.join(
-                            self.fd_config.parallel_config.model_name_or_path,
+                            self.fd_config.model_config.model,
                             vocab_file_names[i],
                         )
                     ):
                         ErnieBotTokenizer.vocab_files_names["vocab_file"] = vocab_file_names[i]
                         break

-                tokenizer = ErnieBotTokenizer.from_pretrained(self.fd_config.parallel_config.model_name_or_path)
+                tokenizer = ErnieBotTokenizer.from_pretrained(self.fd_config.model_config.model)

                 return tokenizer
         except Exception as e:
@@ -85,18 +85,22 @@ class DeepGemmFusedMoeMethod(MoEMethodBase):

             up_gate_proj_weight_scale.append(
                 get_tensor(
+                    (
                         state_dict.pop(up_gate_proj_expert_weight_scale_key_name)
                         if up_gate_proj_expert_weight_scale_key_name in state_dict
-                        else up_gate_proj_expert_weight_scale_key_name,
-                    layer.fd_config.parallel_config.model_name_or_path,
+                        else up_gate_proj_expert_weight_scale_key_name
+                    ),
+                    layer.fd_config.model_config.model,
                 )
             )
             down_proj_weight_scale.append(
                 get_tensor(
+                    (
                         state_dict.pop(down_proj_expert_weight_scale_key_name)
                         if down_proj_expert_weight_scale_key_name in state_dict
-                        else down_proj_expert_weight_scale_key_name,
-                    layer.fd_config.parallel_config.model_name_or_path,
+                        else down_proj_expert_weight_scale_key_name
+                    ),
+                    layer.fd_config.model_config.model,
                 )
             )

@@ -265,7 +265,7 @@ class FusedMoE(nn.Layer):
                         if up_gate_proj_expert_weight_key_name in state_dict
                         else up_gate_proj_expert_weight_key_name
                     ),
-                    self.fd_config.parallel_config.model_name_or_path,
+                    self.fd_config.model_config.model,
                 )
             )
             down_proj_weights.append(
@@ -275,7 +275,7 @@ class FusedMoE(nn.Layer):
                         if down_proj_expert_weight_key_name in state_dict
                         else down_proj_expert_weight_key_name
                     ),
-                    self.fd_config.parallel_config.model_name_or_path,
+                    self.fd_config.model_config.model,
                 )
             )
         else:
@@ -291,7 +291,7 @@ class FusedMoE(nn.Layer):
                         if gate_expert_weight_key_name in state_dict
                         else gate_expert_weight_key_name
                     ),
-                    self.fd_config.parallel_config.model_name_or_path,
+                    self.fd_config.model_config.model,
                 )
                 up = get_tensor(
                     (
@@ -299,7 +299,7 @@ class FusedMoE(nn.Layer):
                         if up_expert_weight_key_name in state_dict
                         else up_expert_weight_key_name
                     ),
-                    self.fd_config.parallel_config.model_name_or_path,
+                    self.fd_config.model_config.model,
                 )
                 up_gate_proj_weights.append(paddle.concat([gate, up], axis=-1))
                 down_proj_weights.append(
@@ -309,7 +309,7 @@ class FusedMoE(nn.Layer):
                         if down_proj_expert_weight_key_name in state_dict
                         else down_proj_expert_weight_key_name
                     ),
-                    self.fd_config.parallel_config.model_name_or_path,
+                    self.fd_config.model_config.model,
                 )
             )
         return up_gate_proj_weights, down_proj_weights, logical_expert_ids
@@ -111,7 +111,7 @@ class DefaultModelLoader(BaseModelLoader):
         # TODO(gongshaotian): Now, only support safetensor
         model_class = MODEL_CLASSES[architectures]
         state_dict = load_composite_checkpoint(
-            fd_config.parallel_config.model_name_or_path,
+            fd_config.model_config.model,
             model_class,
             fd_config,
             return_numpy=True,
@@ -82,7 +82,7 @@ class DynamicWeightManager:
     def _update_ipc_snapshot(self):
         """Update using IPC snapshot strategy for elastic recovery."""
         model_path = os.path.join(
-            self.parallel_config.model_name_or_path,
+            self.model_config.model,
             f"model_state.tp0{self.meta_src_id}.pdparams",
         )

@@ -60,7 +60,7 @@ class RolloutModelConfig:
         local_rank: int = 0,
     ):
         # Required parameters
-        self.model_name_or_path = model_name_or_path
+        self.model = model_name_or_path
         self.max_model_len = max_model_len
         self.tensor_parallel_size = tensor_parallel_size
         self.dynamic_load_weight = dynamic_load_weight
@@ -73,7 +73,7 @@ class MTPProposer(Proposer):
         self.model_config.architectures[0] = "Ernie4_5_MTPForCausalLM"
         self.speculative_config.sharing_model = main_model
         self.model_config.num_hidden_layers = 1
-        self.parallel_config.model_name_or_path = self.speculative_config.model
+        self.model_config.model = self.speculative_config.model
         self.model_config.pretrained_config.prefix_name = "ernie.mtp_block"
         if self.speculative_config.quantization != "":
             self.model_config.quantization = self.speculative_config.quantization
@@ -1484,8 +1484,8 @@ class GPUModelRunner(ModelRunnerBase):

     def _init_image_preprocess(self) -> None:
         processor = DataProcessor(
-            tokenizer_name=self.parallel_config.model_name_or_path,
-            image_preprocessor_name=str(self.parallel_config.model_name_or_path),
+            tokenizer_name=self.model_config.model,
+            image_preprocessor_name=str(self.model_config.model),
         )
         processor.eval()
         image_preprocess = processor.image_preprocessor
@@ -101,7 +101,7 @@ def init_distributed_environment(seed: int = 20) -> Tuple[int, int]:
 def update_fd_config_for_mm(fd_config: FDConfig) -> None:
     if fd_config.model_config.enable_mm:
         tokenizer = ErnieBotTokenizer.from_pretrained(
-            fd_config.parallel_config.model_name_or_path,
+            fd_config.model_config.model,
             model_max_length=fd_config.parallel_config.max_model_len,
             padding_side="right",
             use_fast=False,
@@ -439,7 +439,7 @@ def parse_args():
     parser = argparse.ArgumentParser("FastDeploy LLM Inference")
     parser.add_argument(
         "-m",
-        "--model_name_or_path",
+        "--model",
         type=str,
         default="./output",
         help="model dir",
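Because the worker's argparse flag is renamed here, the launch command assembled by `LLMEngine` (see the `@@ -1064,7` hunk above) changes in the same way; a hand-written worker invocation would swap only the flag name, with a placeholder path and the other flags unchanged:

    old: ... --model_name_or_path /path/to/checkpoint ...
    new: ... --model /path/to/checkpoint ...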