Mirror of https://github.com/PaddlePaddle/FastDeploy.git (synced 2025-10-05 16:48:03 +08:00)
Unify server-side and model-side Config (Part3) (#3047)
* merge model config
* fix arch
* fix rl
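With this part of the unification, the legacy engine-side ModelConfig class is removed (see the `@@ -19,141 +19,10` hunk below) and everything goes through the unified `fastdeploy.config.ModelConfig`, which is now built from a plain dict of engine arguments: keys that match existing attributes override the defaults, and the remaining model details are read from the checkpoint's config via `PretrainedConfig`. A minimal construction sketch, under the assumption of a placeholder checkpoint path (not part of this patch):

    from fastdeploy.config import ModelConfig

    # Hypothetical checkpoint directory; its config.json supplies architectures, head_dim, etc.
    args = {
        "model": "/path/to/checkpoint",   # replaces the old model_name_or_path field
        "dtype": "bfloat16",
        "quantization": None,
    }
    model_cfg = ModelConfig(args)         # defaults overridden by matching keys
    print(model_cfg.architectures)        # filled in from the pretrained config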
@@ -73,7 +73,13 @@ class ErnieArchitectures:


 PRETRAINED_INIT_CONFIGURATION = {
+    "top_p": 1.0,
+    "temperature": 1.0,
     "rope_theta": 10000.0,
+    "penalty_score": 1.0,
+    "frequency_score": 0.0,
+    "presence_score": 0.0,
+    "min_length": 1,
     "num_key_value_heads": -1,
     "start_layer_index": 0,
     "moe_num_shared_experts": 0,
@@ -101,16 +107,7 @@ class ModelConfig:
         self,
         args,
     ):
-        # NOTE(gongshaotain): form _load_model_init_val()
-        self.top_p = 1.0
-        self.temperature = 1.0
-        self.rope_theta = 10000.0
-        self.penalty_score = 1.0
-        self.frequency_score = 0.0
-        self.presence_score = 0.0
-        self.min_length = 1
-        self.model_name_or_path = ""
-
+        self.model = ""
         self.is_quantized = False
         self.max_model_len = 0
         self.dtype = ""
@@ -118,16 +115,13 @@ class ModelConfig:
         self.enable_mm = False
         self.enable_redundant_experts = False
         self.redundant_experts_num = 0
-        self.max_stop_seqs_num = int(envs.FD_MAX_STOP_SEQS_NUM)
-        self.stop_seqs_max_len = int(envs.FD_STOP_SEQS_MAX_LEN)
+        self.quantization = None

         for key, value in args.items():
             if hasattr(self, key):
                 setattr(self, key, value)

-        assert self.model_name_or_path != ""
-        pretrained_config, _ = PretrainedConfig.get_config_dict(self.model_name_or_path)
+        assert self.model != ""
+        pretrained_config, _ = PretrainedConfig.get_config_dict(self.model)
         self.pretrained_config = PretrainedConfig.from_dict(pretrained_config)

         # set attribute from pretrained_config
@@ -149,6 +143,64 @@ class ModelConfig:
         if ErnieArchitectures.contains_ernie_arch(self.architectures):
             self.ori_vocab_size = args.get("ori_vocab_size", self.ori_vocab_size)

+        self.is_unified_ckpt = check_unified_ckpt(self.model)
+
+        self.override_name_from_config()
+        self.read_from_env()
+
+    def override_name_from_config(self):
+        """
+        Override attribute names from the exported model's configuration.
+        """
+
+        if not self.is_unified_ckpt and hasattr(self, "infer_model_mp_num"):
+            self.tensor_parallel_size = self.infer_model_mp_num
+            del self.infer_model_mp_num
+
+        if hasattr(self, "num_hidden_layers"):
+            if hasattr(self, "remove_tail_layer"):
+                if self.remove_tail_layer is True:
+                    self.num_hidden_layers -= 1
+                elif isinstance(self.remove_tail_layer, int):
+                    self.num_hidden_layers -= self.remove_tail_layer
+
+        if not hasattr(self, "mla_use_absorb"):
+            self.mla_use_absorb = False
+
+    def read_from_env(self):
+        """
+        Read configuration information from environment variables and update the object's attributes.
+
+        If an attribute is not present or is an empty string in the environment variables, use the default value.
+        """
+        self.max_stop_seqs_num = int(envs.FD_MAX_STOP_SEQS_NUM)
+        self.stop_seqs_max_len = int(envs.FD_STOP_SEQS_MAX_LEN)
+
+        def reset_config_value(key, value):
+            if not hasattr(self, key.lower()):
+                if os.getenv(key, None):
+                    value = eval(os.getenv(key))
+                    logger.info(f"Get parameter `{key}` = {value} from environment.")
+                else:
+                    logger.info(f"Parameter `{key}` will use default value {value}.")
+                setattr(self, key.lower(), value)
+
+        reset_config_value("COMPRESSION_RATIO", 1.0)
+        reset_config_value("ROPE_THETA", 10000)
+
+    def _get_download_model(self, model_name, model_type="default"):
+        # TODO: Provide dynamic graph for self-downloading and save to the specified download directory.
+        pass
+
+    def print(self):
+        """
+        Print all configuration information.
+        """
+        logger.info("Model Configuration Information :")
+        for k, v in self.__dict__.items():
+            logger.info("{:<20}:{:<6}{}".format(k, "", v))
+        logger.info("=============================================================")
+

 class ParallelConfig:
     """Configuration for the distributed execution."""
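`read_from_env` above only fills attributes that are not already set: `COMPRESSION_RATIO` and `ROPE_THETA` map to the lowercase attributes `compression_ratio` and `rope_theta`, with the given defaults used when the variable is absent. A rough sketch of the behaviour, assuming a placeholder checkpoint whose config does not already define the attribute:

    import os
    from fastdeploy.config import ModelConfig

    os.environ["COMPRESSION_RATIO"] = "0.8"        # picked up and eval'd by reset_config_value

    cfg = ModelConfig({"model": "/path/to/checkpoint"})   # hypothetical path
    print(cfg.compression_ratio)                   # 0.8, read from the environment
    print(cfg.max_stop_seqs_num)                   # taken from envs.FD_MAX_STOP_SEQS_NUM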
@@ -173,7 +225,6 @@ class ParallelConfig:
         From old wersion worker args
         TODO(gongshaotian): Reclassify
         """
-        self.model_name_or_path: str = "./output"
         self.max_num_seqs: int = 34
         # Set default block num for profile run
         self.total_block_num: int = 2000
@@ -609,7 +660,7 @@ class CacheConfig:
             self.enable_hierarchical_cache = True

         if self.model_cfg is not None:
-            if hasattr(self.model_cfg, "quantization_config"):
+            if self.model_cfg.quantization_config is not None:
                 self.cache_dtype = self.model_cfg.quantization_config.get("kv_cache_quant_type", self.cache_dtype)
             if (
                 hasattr(self.model_cfg, "num_key_value_heads")
@@ -631,7 +682,7 @@ class CacheConfig:
             else:
                 byte_size = 2
             self.each_token_cache_space = int(
-                self.model_cfg.num_layers * kv_num_head * self.model_cfg.head_dim * byte_size
+                self.model_cfg.num_hidden_layers * kv_num_head * self.model_cfg.head_dim * byte_size
             )
             self.bytes_per_block = int(self.each_token_cache_space * self.block_size)
             self.bytes_per_layer_per_block = int(
@@ -22,6 +22,7 @@ from typing import Any, Dict, List, Optional
 from fastdeploy.config import (
     CacheConfig,
     GraphOptimizationConfig,
+    LoadConfig,
     SpeculativeConfig,
     TaskOption,
 )
@@ -756,18 +757,6 @@ class EngineArgs:
         """
         return cls(**{field.name: getattr(args, field.name) for field in dataclass_fields(cls)})

-    def create_model_config(self) -> ModelConfig:
-        """
-        Create and return a ModelConfig object based on the current settings.
-        """
-        return ModelConfig(
-            model_name_or_path=self.model,
-            config_json_file=self.model_config_name,
-            quantization=self.quantization,
-            dynamic_load_weight=self.dynamic_load_weight,
-            load_strategy=self.load_strategy,
-        )
-
     def create_speculative_config(self) -> SpeculativeConfig:
         """ """
         speculative_args = asdict(self)
@@ -826,7 +815,12 @@ class EngineArgs:
         """
         Create and return a Config object based on the current settings.
         """
-        model_cfg = self.create_model_config()
+        all_dict = asdict(self)
+        model_cfg = ModelConfig(all_dict)
+        all_dict["model_cfg"] = model_cfg
+        cache_cfg = CacheConfig(all_dict)
+        load_cfg = LoadConfig(all_dict)
+
         if not model_cfg.is_unified_ckpt and hasattr(model_cfg, "tensor_parallel_size"):
             self.tensor_parallel_size = model_cfg.tensor_parallel_size
         if self.max_num_batched_tokens is None:
@@ -843,16 +837,13 @@ class EngineArgs:
             self.tensor_parallel_size <= 1 and self.enable_custom_all_reduce
         ), "enable_custom_all_reduce must be used with tensor_parallel_size>1"

-        all_dict = asdict(self)
-        all_dict["model_cfg"] = model_cfg
-        cache_cfg = CacheConfig(all_dict)
-
         return Config(
             model_name_or_path=self.model,
             model_config=model_cfg,
             scheduler_config=scheduler_cfg,
             tokenizer=self.tokenizer,
             cache_config=cache_cfg,
+            load_config=load_cfg,
             parallel_config=self.create_parallel_config(),
             max_model_len=self.max_model_len,
             tensor_parallel_size=self.tensor_parallel_size,
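Downstream call sites follow the same split: weight-loading options move from `model_config` to the new `load_config`, and the model path is read from `model_config.model` rather than `parallel_config.model_name_or_path` (see the `LLMEngine` and worker hunks below). A small access sketch, assuming `cfg` is the `Config` returned by `create_engine_config`:

    # Hypothetical accesses after this change; cfg is an assembled Config object.
    strategy  = cfg.load_config.load_strategy          # was cfg.model_config.load_strategy
    dynamic   = cfg.load_config.dynamic_load_weight    # was cfg.model_config.dynamic_load_weight
    model_dir = cfg.model_config.model                 # was parallel_config.model_name_or_path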
@@ -19,141 +19,10 @@ from dataclasses import dataclass
 from datetime import datetime
 from typing import Any, Dict, List, Optional

-from fastdeploy import envs
-from fastdeploy.config import CacheConfig
+from fastdeploy.config import CacheConfig, LoadConfig, ModelConfig
 from fastdeploy.platforms import current_platform
 from fastdeploy.scheduler import SchedulerConfig
-from fastdeploy.utils import (
-    ceil_div,
-    check_unified_ckpt,
-    get_host_ip,
-    is_port_available,
-    llm_logger,
-)
+from fastdeploy.utils import ceil_div, get_host_ip, is_port_available, llm_logger


-class ModelConfig:
-    """
-    Configuration class for the model.
-
-    Attributes:
-        model_dir (str): Directory path to the model.
-        is_unified_ckpt (bool): Flag indicating if the checkpoint is unified.
-        model_name_or_path (str): Name or path of the model.
-    """
-
-    def __init__(
-        self,
-        model_name_or_path: str,
-        config_json_file: str = "config.json",
-        dynamic_load_weight: bool = False,
-        load_strategy: str = "ipc_snapshot",
-        quantization: str = None,
-        download_dir: Optional[str] = None,
-    ):
-        """
-        Initialize the ModelConfig class.
-
-        Args:
-            model_name_or_path (str): Name or path of the model.
-            config_json_file (str): Path to the configuration JSON file. Default is 'config.json'.
-            download_dir (Optional[str]): Directory to download model files. Default is None.
-        """
-        self.model_dir = model_name_or_path
-        self.is_unified_ckpt = check_unified_ckpt(self.model_dir)
-        self.dynamic_load_weight = dynamic_load_weight
-        self.load_strategy = load_strategy
-        self.quantization = quantization
-
-        config_file = os.path.join(model_name_or_path, config_json_file)
-        if os.path.isfile(model_name_or_path):
-            try:
-                from paddleformers.transformers import AutoConfig
-
-                config = AutoConfig.from_pretrained(model_name_or_path)
-                config_dict = {k: v for k, v in vars(config).items() if not k.startswith("_")}
-                for key, value in config_dict.items():
-                    setattr(self, key, value)
-            except Exception:
-                llm_logger.error(
-                    "Don't support the current model, you can use `paddleformers` to register your model."
-                )
-                raise ValueError(
-                    "Don't support the current model, you can use `paddleformers` to register your model."
-                )
-        else:
-            with open(config_file, "r", encoding="utf-8") as f:
-                config_dict = json.load(f)
-            for key, value in config_dict.items():
-                try:
-                    setattr(self, key, value)
-                except Exception:
-                    continue
-
-        if isinstance(self.architectures, list):
-            self.architectures = self.architectures[0]
-        self.model_name_or_path = model_name_or_path
-        self.override_name_from_config()
-        self.read_from_env()
-
-    def override_name_from_config(self):
-        """
-        Override attribute names from the exported model's configuration.
-        """
-
-        if not self.is_unified_ckpt and hasattr(self, "infer_model_mp_num"):
-            self.tensor_parallel_size = self.infer_model_mp_num
-            del self.infer_model_mp_num
-
-        if hasattr(self, "num_hidden_layers"):
-            if hasattr(self, "remove_tail_layer"):
-                if self.remove_tail_layer is True:
-                    self.num_hidden_layers -= 1
-                elif isinstance(self.remove_tail_layer, int):
-                    self.num_hidden_layers -= self.remove_tail_layer
-
-            self.num_layers = self.num_hidden_layers
-            del self.num_hidden_layers
-
-        if not hasattr(self, "mla_use_absorb"):
-            self.mla_use_absorb = False
-        if not hasattr(self, "head_dim"):
-            assert hasattr(self, "hidden_size") and hasattr(self, "num_attention_heads")
-            self.head_dim = self.hidden_size // self.num_attention_heads
-
-    def read_from_env(self):
-        """
-        Read configuration information from environment variables and update the object's attributes.
-
-        If an attribute is not present or is an empty string in the environment variables, use the default value.
-        """
-        self.max_stop_seqs_num = int(envs.FD_MAX_STOP_SEQS_NUM)
-        self.stop_seqs_max_len = int(envs.FD_STOP_SEQS_MAX_LEN)
-
-        def reset_config_value(key, value):
-            if not hasattr(self, key.lower()):
-                if os.getenv(key, None):
-                    value = eval(os.getenv(key))
-                    llm_logger.info(f"Get parameter `{key}` = {value} from environment.")
-                else:
-                    llm_logger.info(f"Parameter `{key}` will use default value {value}.")
-                setattr(self, key.lower(), value)
-
-        reset_config_value("COMPRESSION_RATIO", 1.0)
-        reset_config_value("ROPE_THETA", 10000)
-
-    def _get_download_model(self, model_name, model_type="default"):
-        # TODO: Provide dynamic graph for self-downloading and save to the specified download directory.
-        pass
-
-    def print(self):
-        """
-        Print all configuration information.
-        """
-        llm_logger.info("Model Configuration Information :")
-        for k, v in self.__dict__.items():
-            llm_logger.info("{:<20}:{:<6}{}".format(k, "", v))
-        llm_logger.info("=============================================================")
-
-
 class ParallelConfig:
@@ -288,6 +157,7 @@ class Config:
         cache_config: CacheConfig,
         scheduler_config: SchedulerConfig,
         parallel_config: ParallelConfig,
+        load_config: LoadConfig,
         commit_config: CommitConfig = CommitConfig(),
         model_name_or_path: str = None,
         tokenizer: str = None,
@@ -345,6 +215,7 @@ class Config:
         self.cache_config = cache_config
         self.scheduler_config = scheduler_config
         self.parallel_config = parallel_config
+        self.load_config = load_config
         self.commit_config = commit_config
         self.model_name_or_path = model_name_or_path
         self.tokenizer = tokenizer
@@ -1064,7 +1064,7 @@ class LLMEngine:
             f" --devices {self.cfg.device_ids} {py_script}"
             f" --max_num_seqs {self.cfg.max_num_seqs} --max_model_len {self.cfg.max_model_len}"
             f" --gpu_memory_utilization {self.cfg.cache_config.gpu_memory_utilization}"
-            f" --model_name_or_path {self.cfg.model_name_or_path!s}"
+            f" --model {self.cfg.model_name_or_path!s}"
             f" --device_ids {self.cfg.device_ids}"
             f" --tensor_parallel_size {self.cfg.tensor_parallel_size}"
             f" --engine_worker_queue_port {self.cfg.engine_worker_queue_port!s}"
@@ -1084,7 +1084,7 @@ class LLMEngine:
             f" --speculative_config '{self.cfg.speculative_config.to_json_string()}'"
             f" --graph_optimization_config '{self.cfg.graph_optimization_config.to_json_string()}'"
             f" --guided_decoding_backend {self.cfg.guided_decoding_backend}"
-            f" --load_strategy {self.cfg.model_config.load_strategy}"
+            f" --load_strategy {self.cfg.load_config.load_strategy}"
         )

         worker_append_flag = {
@@ -1092,7 +1092,7 @@ class LLMEngine:
             "enable_prefix_caching": self.cfg.cache_config.enable_prefix_caching,
             "enable_chunked_prefill": self.cfg.cache_config.enable_chunked_prefill,
             "do_profile": self.do_profile,
-            "dynamic_load_weight": self.cfg.model_config.dynamic_load_weight,
+            "dynamic_load_weight": self.cfg.load_config.dynamic_load_weight,
             "disable_any_whitespace": self.cfg.disable_any_whitespace,
             "enable_custom_all_reduce": self.cfg.parallel_config.enable_custom_all_reduce,
             "enable_logprob": self.cfg.enable_logprob,
@@ -1231,9 +1231,9 @@ class LLMEngine:
                 elif (match := re.search(r"Start load layer (\d+)", line)) or (
                     match := re.search(r"set state for layer (\d+)", line)
                 ):
-                    progress = eval(match.group(1)) * 1.0 / self.cfg.model_config.num_layers
+                    progress = eval(match.group(1)) * 1.0 / self.cfg.model_config.num_hidden_layers
                     self.worker_init_status["layer_loadding"] = progress
-                    if self.worker_init_status["layer_loadding"] == self.cfg.model_config.num_layers - 1:
+                    if self.worker_init_status["layer_loadding"] == self.cfg.model_config.num_hidden_layers - 1:
                         self.worker_init_status["finished"] = True

         self.checking_worker_status_thread = threading.Thread(target=detect_thread, daemon=True)
@@ -70,7 +70,7 @@ class InputPreprocessor:
         reasoning_parser_obj = None
         if self.reasoning_parser:
             reasoning_parser_obj = ReasoningParserManager.get_reasoning_parser(self.reasoning_parser)
-        architectures = ModelConfig(self.model_name_or_path).architectures
+        architectures = ModelConfig({"model": self.model_name_or_path}).architectures[0]
         if not self.enable_mm:
             if not ErnieArchitectures.contains_ernie_arch(architectures):
                 from fastdeploy.input.text_processor import DataProcessor
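Throwaway configs built from a bare path take the same dict form now, and `architectures` stays a list in the unified class (hence the `[0]`). A hedged usage sketch with a placeholder checkpoint path:

    from fastdeploy.config import ModelConfig

    # Hypothetical checkpoint directory; architectures is read from its pretrained config.
    arch = ModelConfig({"model": "/path/to/checkpoint"}).architectures[0]
    print(arch)   # single architecture name, e.g. as checked by ErnieArchitectures.contains_ernie_arch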
@@ -270,7 +270,7 @@ class BackendBase:
                 from transformers import AutoTokenizer, PreTrainedTokenizerFast

                 tokenizer = AutoTokenizer.from_pretrained(
-                    self.fd_config.parallel_config.model_name_or_path,
+                    self.fd_config.model_config.model,
                     use_fast=False,
                 )

@@ -289,14 +289,14 @@ class BackendBase:
                 for i in range(len(vocab_file_names)):
                     if os.path.exists(
                         os.path.join(
-                            self.fd_config.parallel_config.model_name_or_path,
+                            self.fd_config.model_config.model,
                             vocab_file_names[i],
                         )
                     ):
                         ErnieBotTokenizer.vocab_files_names["vocab_file"] = vocab_file_names[i]
                         break

-                tokenizer = ErnieBotTokenizer.from_pretrained(self.fd_config.parallel_config.model_name_or_path)
+                tokenizer = ErnieBotTokenizer.from_pretrained(self.fd_config.model_config.model)

                 return tokenizer
         except Exception as e:
@@ -85,18 +85,22 @@ class DeepGemmFusedMoeMethod(MoEMethodBase):

             up_gate_proj_weight_scale.append(
                 get_tensor(
+                    (
                         state_dict.pop(up_gate_proj_expert_weight_scale_key_name)
                         if up_gate_proj_expert_weight_scale_key_name in state_dict
-                        else up_gate_proj_expert_weight_scale_key_name,
-                    layer.fd_config.parallel_config.model_name_or_path,
+                        else up_gate_proj_expert_weight_scale_key_name
+                    ),
+                    layer.fd_config.model_config.model,
                 )
             )
             down_proj_weight_scale.append(
                 get_tensor(
+                    (
                         state_dict.pop(down_proj_expert_weight_scale_key_name)
                         if down_proj_expert_weight_scale_key_name in state_dict
-                        else down_proj_expert_weight_scale_key_name,
-                    layer.fd_config.parallel_config.model_name_or_path,
+                        else down_proj_expert_weight_scale_key_name
+                    ),
+                    layer.fd_config.model_config.model,
                 )
             )

@@ -265,7 +265,7 @@ class FusedMoE(nn.Layer):
                         if up_gate_proj_expert_weight_key_name in state_dict
                         else up_gate_proj_expert_weight_key_name
                     ),
-                    self.fd_config.parallel_config.model_name_or_path,
+                    self.fd_config.model_config.model,
                 )
             )
             down_proj_weights.append(
@@ -275,7 +275,7 @@ class FusedMoE(nn.Layer):
                         if down_proj_expert_weight_key_name in state_dict
                         else down_proj_expert_weight_key_name
                     ),
-                    self.fd_config.parallel_config.model_name_or_path,
+                    self.fd_config.model_config.model,
                 )
             )
         else:
@@ -291,7 +291,7 @@ class FusedMoE(nn.Layer):
                         if gate_expert_weight_key_name in state_dict
                         else gate_expert_weight_key_name
                     ),
-                    self.fd_config.parallel_config.model_name_or_path,
+                    self.fd_config.model_config.model,
                 )
                 up = get_tensor(
                     (
@@ -299,7 +299,7 @@ class FusedMoE(nn.Layer):
                         if up_expert_weight_key_name in state_dict
                         else up_expert_weight_key_name
                     ),
-                    self.fd_config.parallel_config.model_name_or_path,
+                    self.fd_config.model_config.model,
                 )
                 up_gate_proj_weights.append(paddle.concat([gate, up], axis=-1))
                 down_proj_weights.append(
@@ -309,7 +309,7 @@ class FusedMoE(nn.Layer):
                         if down_proj_expert_weight_key_name in state_dict
                         else down_proj_expert_weight_key_name
                     ),
-                    self.fd_config.parallel_config.model_name_or_path,
+                    self.fd_config.model_config.model,
                 )
             )
         return up_gate_proj_weights, down_proj_weights, logical_expert_ids
@@ -111,7 +111,7 @@ class DefaultModelLoader(BaseModelLoader):
         # TODO(gongshaotian): Now, only support safetensor
         model_class = MODEL_CLASSES[architectures]
         state_dict = load_composite_checkpoint(
-            fd_config.parallel_config.model_name_or_path,
+            fd_config.model_config.model,
             model_class,
             fd_config,
             return_numpy=True,
@@ -82,7 +82,7 @@ class DynamicWeightManager:
     def _update_ipc_snapshot(self):
         """Update using IPC snapshot strategy for elastic recovery."""
         model_path = os.path.join(
-            self.parallel_config.model_name_or_path,
+            self.model_config.model,
             f"model_state.tp0{self.meta_src_id}.pdparams",
         )

@@ -60,7 +60,7 @@ class RolloutModelConfig:
         local_rank: int = 0,
     ):
         # Required parameters
-        self.model_name_or_path = model_name_or_path
+        self.model = model_name_or_path
         self.max_model_len = max_model_len
         self.tensor_parallel_size = tensor_parallel_size
         self.dynamic_load_weight = dynamic_load_weight
@@ -73,7 +73,7 @@ class MTPProposer(Proposer):
         self.model_config.architectures[0] = "Ernie4_5_MTPForCausalLM"
         self.speculative_config.sharing_model = main_model
         self.model_config.num_hidden_layers = 1
-        self.parallel_config.model_name_or_path = self.speculative_config.model
+        self.model_config.model = self.speculative_config.model
         self.model_config.pretrained_config.prefix_name = "ernie.mtp_block"
         if self.speculative_config.quantization != "":
             self.model_config.quantization = self.speculative_config.quantization
@@ -1484,8 +1484,8 @@ class GPUModelRunner(ModelRunnerBase):

     def _init_image_preprocess(self) -> None:
         processor = DataProcessor(
-            tokenizer_name=self.parallel_config.model_name_or_path,
-            image_preprocessor_name=str(self.parallel_config.model_name_or_path),
+            tokenizer_name=self.model_config.model,
+            image_preprocessor_name=str(self.model_config.model),
         )
         processor.eval()
         image_preprocess = processor.image_preprocessor
@@ -101,7 +101,7 @@ def init_distributed_environment(seed: int = 20) -> Tuple[int, int]:
 def update_fd_config_for_mm(fd_config: FDConfig) -> None:
     if fd_config.model_config.enable_mm:
         tokenizer = ErnieBotTokenizer.from_pretrained(
-            fd_config.parallel_config.model_name_or_path,
+            fd_config.model_config.model,
             model_max_length=fd_config.parallel_config.max_model_len,
             padding_side="right",
             use_fast=False,
@@ -439,7 +439,7 @@ def parse_args():
     parser = argparse.ArgumentParser("FastDeploy LLM Inference")
     parser.add_argument(
         "-m",
-        "--model_name_or_path",
+        "--model",
         type=str,
         default="./output",
         help="model dir",
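Because the worker's argparse flag is renamed here, the launch command assembled by `LLMEngine` (see the `@@ -1064,7` hunk above) changes in the same way; a hand-written worker invocation would swap only the flag name, with a placeholder path and the other flags unchanged:

    old: ... --model_name_or_path /path/to/checkpoint ...
    new: ... --model /path/to/checkpoint ...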