diff --git a/fastdeploy/distributed/communication_op.py b/fastdeploy/distributed/communication_op.py
index d4ad8d6da..70a368e98 100644
--- a/fastdeploy/distributed/communication_op.py
+++ b/fastdeploy/distributed/communication_op.py
@@ -17,13 +17,15 @@
 import paddle
 import paddle.distributed as dist
 
-
-@paddle.jit.marker.unified
-def tensor_model_parallel_all_reduce(input_: paddle.Tensor) -> paddle.Tensor:
-    """All-reduce the input tensor across model parallel group."""
-    if paddle.in_dynamic_mode():
-        hcg = dist.fleet.get_hybrid_communicate_group()
-        mp_group = hcg.get_model_parallel_group()
-        dist.all_reduce(input_, group=mp_group)
-    else:
-        dist.all_reduce(input_)
+try:
+    @paddle.jit.marker.unified
+    def tensor_model_parallel_all_reduce(input_: paddle.Tensor) -> paddle.Tensor:
+        """All-reduce the input tensor across model parallel group."""
+        if paddle.in_dynamic_mode():
+            hcg = dist.fleet.get_hybrid_communicate_group()
+            mp_group = hcg.get_model_parallel_group()
+            dist.all_reduce(input_, group=mp_group)
+        else:
+            dist.all_reduce(input_)
+except:
+    tensor_model_parallel_all_reduce=None
\ No newline at end of file
diff --git a/fastdeploy/model_executor/layers/attention/flash_attn_backend.py b/fastdeploy/model_executor/layers/attention/flash_attn_backend.py
index 74a234bd1..e0aef5ae0 100644
--- a/fastdeploy/model_executor/layers/attention/flash_attn_backend.py
+++ b/fastdeploy/model_executor/layers/attention/flash_attn_backend.py
@@ -21,7 +21,11 @@ from dataclasses import dataclass, field
 from typing import List, Optional
 
 import paddle
-from paddle.nn.functional.flash_attention import flash_attention_v3_varlen
+
+try:
+    from paddle.nn.functional.flash_attention import flash_attention_v3_varlen
+except:
+    flash_attention_v3_varlen = None
 
 from fastdeploy.config import FDConfig
 from fastdeploy.model_executor.layers.attention.attention import Attention
diff --git a/fastdeploy/model_executor/layers/linear.py b/fastdeploy/model_executor/layers/linear.py
index b8dc49e1b..e25db088a 100644
--- a/fastdeploy/model_executor/layers/linear.py
+++ b/fastdeploy/model_executor/layers/linear.py
@@ -293,7 +293,7 @@ class ColumnParallelLinear(LinearBase):
         )
         if self.nranks > 0:
             # col parallel
-            _set_var_distributed(self.linear_weight, split_axis=-1)
+            _set_var_distributed(self.linear_weight, split_axis=1)
 
         self.linear_bias = None
         if self.with_bias:
@@ -304,7 +304,7 @@ class ColumnParallelLinear(LinearBase):
             )
             if self.nranks > 0:
                 # col parallel
-                _set_var_distributed(self.linear_bias, split_axis=-1)
+                _set_var_distributed(self.linear_bias, split_axis=1)
 
         # smooth quant
         self.linear_shift = None
diff --git a/fastdeploy/model_executor/layers/moe/moe.py b/fastdeploy/model_executor/layers/moe/moe.py
index a14b4e2cc..3219afc21 100644
--- a/fastdeploy/model_executor/layers/moe/moe.py
+++ b/fastdeploy/model_executor/layers/moe/moe.py
@@ -89,6 +89,7 @@ class FusedMoE(nn.Layer):
         self.routed_scaling_factor = routed_scaling_factor
 
         moe_quant_config = fd_config.quant_config
+        self.moe_quant_type = None
         if moe_quant_config:
             self.quant_method = moe_quant_config.get_quant_method(self)
             self.moe_quant_type = moe_quant_config.name()
@@ -142,7 +143,7 @@ class FusedMoE(nn.Layer):
         if self.moe_quant_type == "fp8":
             #(TODO:gaoziyuan)
             pass
-        else:
+        elif self.moe_quant_type == "wint8":
             self.weight_dtype = "int8"
             self.init_weight_only_scale()
diff --git a/fastdeploy/model_executor/model_loader.py b/fastdeploy/model_executor/model_loader.py
index 03ea7fcc6..cb037c2c6 100644
--- a/fastdeploy/model_executor/model_loader.py
+++ b/fastdeploy/model_executor/model_loader.py
@@ -91,8 +91,11 @@ class DefaultModelLoader(BaseModelLoader):
     def load_model(self, fd_config: FDConfig) -> nn.Layer:
         context = paddle.LazyGuard()
         architectures = fd_config.model_config.architectures[0]
-        # TODO(gongshaotian): Now, only support safetensor
-        model_class = MODEL_CLASSES[architectures]
+
+        if fd_config.load_config.dynamic_load_weight:
+            # register rl model
+            import fastdeploy.rl
+            architectures = architectures + "RL"
 
         with context:
             model_cls = ModelRegistry.get_class(architectures)
@@ -104,6 +107,8 @@ class DefaultModelLoader(BaseModelLoader):
         if fd_config.load_config.dynamic_load_weight:
             return model
 
+        # TODO(gongshaotian): Now, only support safetensor
+        model_class = MODEL_CLASSES[architectures]
         state_dict = load_composite_checkpoint(
             fd_config.parallel_config.model_name_or_path,
             model_class,
diff --git a/fastdeploy/model_executor/models/__init__.py b/fastdeploy/model_executor/models/__init__.py
index 2ace118ef..366a7f4a4 100644
--- a/fastdeploy/model_executor/models/__init__.py
+++ b/fastdeploy/model_executor/models/__init__.py
@@ -36,8 +36,7 @@ def _find_py_files(root_dir):
 
 
 def auto_models_registry(dir_path,
-                         register_path="fastdeploy.model_executor.models",
-                         suffix=""):
+                         register_path="fastdeploy.model_executor.models"):
     """
     auto registry all models in this folder
     """
@@ -49,7 +48,7 @@ def auto_models_registry(dir_path,
             if inspect.isclass(attr) and issubclass(
                     attr,
                     ModelForCasualLM) and attr is not ModelForCasualLM:
-                ModelRegistry.register(attr, suffix=suffix)
+                ModelRegistry.register(attr)
         except ImportError:
             raise ImportError(f"{module_file=} import error")
 
diff --git a/fastdeploy/model_executor/models/model_base.py b/fastdeploy/model_executor/models/model_base.py
index 2e252fd0e..c4b1b8904 100644
--- a/fastdeploy/model_executor/models/model_base.py
+++ b/fastdeploy/model_executor/models/model_base.py
@@ -28,12 +28,12 @@ class ModelRegistry:
     _registry = {}
 
     @classmethod
-    def register(cls, model_class, suffix=""):
+    def register(cls, model_class):
         """register model class"""
         if issubclass(
                 model_class,
                 ModelForCasualLM) and model_class is not ModelForCasualLM:
-            cls._registry[f"{model_class.name()}{suffix}"] = model_class
+            cls._registry[model_class.name()] = model_class
         return model_class
 
     @classmethod
diff --git a/fastdeploy/model_executor/models/qwen2.py b/fastdeploy/model_executor/models/qwen2.py
index 0a5912afb..242d6f9da 100644
--- a/fastdeploy/model_executor/models/qwen2.py
+++ b/fastdeploy/model_executor/models/qwen2.py
@@ -302,6 +302,7 @@ class Qwen2ForCausalLM(ModelForCasualLM):
         """
         super(Qwen2ForCausalLM, self).__init__(fd_config)
 
+        self.fd_config =fd_config
         self.model = Qwen2Model(fd_config=fd_config)
 
         self.ori_vocab_size = fd_config.model_config.ori_vocab_size
diff --git a/fastdeploy/model_executor/ops/gpu/__init__.py b/fastdeploy/model_executor/ops/gpu/__init__.py
index 0a6c9e3a3..49ed5e0ea 100644
--- a/fastdeploy/model_executor/ops/gpu/__init__.py
+++ b/fastdeploy/model_executor/ops/gpu/__init__.py
@@ -13,10 +13,19 @@
 # limitations under the License.
 """fastdeploy gpu ops"""
 
-import os
+import sys
+
 from fastdeploy.import_ops import import_custom_ops
 
 PACKAGE = "fastdeploy.model_executor.ops.gpu"
 
 import_custom_ops(PACKAGE, "..base.fastdeploy_base_ops", globals())
 import_custom_ops(PACKAGE, ".fastdeploy_ops", globals())
+
+
+def tolerant_import_error():
+    class NoneModule:
+        def __getattr__(self, name):
+            return None
+
+    sys.modules[__name__] = NoneModule()
diff --git a/fastdeploy/rl/__init__.py b/fastdeploy/rl/__init__.py
index f8f042712..fc06329d5 100644
--- a/fastdeploy/rl/__init__.py
+++ b/fastdeploy/rl/__init__.py
@@ -17,4 +17,4 @@ import os
 
 from fastdeploy.model_executor.models import auto_models_registry
 
-auto_models_registry(os.path.dirname(__file__), "fastdeploy.rl", suffix="RL")
+auto_models_registry(os.path.dirname(__file__), "fastdeploy.rl")
diff --git a/fastdeploy/rl/rollout_config.py b/fastdeploy/rl/rollout_config.py
new file mode 100644
index 000000000..045214f76
--- /dev/null
+++ b/fastdeploy/rl/rollout_config.py
@@ -0,0 +1,108 @@
+"""
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+
+from fastdeploy.worker.worker_process import initialize_fd_config
+
+
+class RolloutModelConfig:
+    def __init__(
+        self,
+        model_name_or_path: str,
+        max_model_len: int = 32768,
+        tensor_parallel_size: int = 4,
+        dynamic_load_weight: bool = True,
+        load_strategy: str = "meta",
+        enable_mm: bool = False,
+        # Default values for all other parameters
+        max_num_seqs: int = 34,
+        total_block_num: int = 2000,
+        block_size: int = 64,
+        engine_worker_queue_port: int = 9923,
+        device_ids: str = "0",
+        dtype: str = "bfloat16",
+        enc_dec_block_num: int = 1,
+        kv_cache_ratio: float = 0.7,
+        first_token_id: int = 1,
+        gpu_memory_utilization: float = 0.9,
+        engine_pid: int = None,
+        do_profile: bool = False,
+        pad_token_id: int = -1,
+        eos_tokens_lens: int = 2,
+        enable_chunked_prefill: bool = False,
+        speculative_method: str = None,
+        speculative_max_draft_token_num: int = 1,
+        speculative_model_name_or_path: str = "",
+        speculative_model_quantization: str = "WINT8",
+        max_num_batched_tokens: int = 2048,
+        enable_prefix_caching: bool = False,
+        splitwise_role: str = "mixed",
+        expert_parallel_size: int = 1,
+        enable_expert_parallell: bool = False,
+        ori_vocab_size: int = None,
+        quantization: str = "None",
+        enable_static_graph_inference: bool = False,
+        use_cudagraph: bool = False,
+        max_capture_batch_size: int = 64,
+        guided_decoding_backend: str = "off",
+        disable_any_whitespace: bool = True,
+    ):
+        # Required parameters
+        self.model_name_or_path = model_name_or_path
+        self.max_model_len = max_model_len
+        self.tensor_parallel_size = tensor_parallel_size
+        self.dynamic_load_weight = dynamic_load_weight
+        self.load_strategy = load_strategy
+        self.enable_mm = enable_mm
+
+        # Optional parameters with defaults
+        self.max_num_seqs = max_num_seqs
+        self.total_block_num = total_block_num
+        self.block_size = block_size
+        self.engine_worker_queue_port = engine_worker_queue_port
+        self.device_ids = device_ids
+        self.dtype = dtype
+        self.enc_dec_block_num = enc_dec_block_num
+        self.kv_cache_ratio = kv_cache_ratio
+        self.first_token_id = first_token_id
+        self.gpu_memory_utilization = gpu_memory_utilization
+        self.engine_pid = engine_pid
+        self.do_profile = do_profile
+        self.pad_token_id = pad_token_id
+        self.eos_tokens_lens = eos_tokens_lens
+        self.enable_chunked_prefill = enable_chunked_prefill
+        self.speculative_method = speculative_method
+        self.speculative_max_draft_token_num = speculative_max_draft_token_num
+        self.speculative_model_name_or_path = speculative_model_name_or_path
+        self.speculative_model_quantization = speculative_model_quantization
+        self.max_num_batched_tokens = max_num_batched_tokens
+        self.enable_prefix_caching = enable_prefix_caching
+        self.splitwise_role = splitwise_role
+        self.expert_parallel_size = expert_parallel_size
+        self.enable_expert_parallell = enable_expert_parallell
+        self.ori_vocab_size = ori_vocab_size
+        self.quantization = quantization
+        self.enable_static_graph_inference = enable_static_graph_inference
+        self.use_cudagraph = use_cudagraph
+        self.max_capture_batch_size = max_capture_batch_size
+        self.guided_decoding_backend = guided_decoding_backend
+        self.disable_any_whitespace = disable_any_whitespace
+
+    def __str__(self):
+        return "\n".join(f"{k}: {v}" for k, v in self.__dict__.items())
+
+    def initialize(self):
+        """Initialize the final fd config"""
+        return initialize_fd_config(self)
diff --git a/fastdeploy/rl/rollout_model.py b/fastdeploy/rl/rollout_model.py
index 53e453274..6498fd318 100644
--- a/fastdeploy/rl/rollout_model.py
+++ b/fastdeploy/rl/rollout_model.py
@@ -24,25 +24,18 @@ from fastdeploy.config import FDConfig
 from fastdeploy.model_executor.model_loader import ModelRegistry
 from fastdeploy.model_executor.models.ernie4_5_moe import \
     Ernie4_5_MoeForCausalLM
-from fastdeploy.model_executor.models.qwen2 import Qwen2PretrainedModel
-from fastdeploy.model_executor.models.qwen3 import Qwen3PretrainedModel
-from fastdeploy.model_executor.models.qwen3moe import Qwen3MoePretrainedModel
+from fastdeploy.model_executor.models.qwen2 import Qwen2ForCausalLM
+from fastdeploy.model_executor.models.qwen3 import Qwen3ForCausalLM
+from fastdeploy.model_executor.models.qwen3moe import Qwen3MoeForCausalLM
+from fastdeploy.rl.rollout_config import RolloutModelConfig
 
-RL_MODEL_CLASSES = {
-    "Ernie4_5_MoeForCausalLMRL": Ernie4_5_MoeForCausalLM,
-    "Qwen2ForCausalLMRL": Qwen2PretrainedModel,
-    "Qwen3ForCausalLMRL": Qwen3PretrainedModel,
-    "Qwen3MoeForCausalLMRL": Qwen3MoePretrainedModel,
-}
-
-
-class RollOutModel(nn.Layer):
+class RolloutModel(nn.Layer):
     """Main model class for rollout operations, supports multimodal components for train."""
 
-    def __init__(self, fd_config: FDConfig):
+    def __init__(self, rollout_model_config: RolloutModelConfig):
         """Initialize with FastDeploy configuration."""
-        super(RollOutModel, self).__init__()
-        self.fd_config = fd_config
+        super(RolloutModel, self).__init__()
+        self.fd_config = rollout_model_config.initialize()
         self._init_models()
 
     def _init_models(self):
@@ -90,9 +83,9 @@ class RollOutModel(nn.Layer):
         all_params = {}
         for model in self.rollout_models:
             for name, param in model.state_dict().items():
-                logger.debug(
-                    f"Model param: {name}, shape={param.shape}, dtype={param.dtype}"
-                )
+                # logger.debug(
+                #     f"Model param: {name}, shape={param.shape}, dtype={param.dtype}"
+                # )
                 all_params[name] = param
         return all_params
 
@@ -123,11 +116,13 @@ class Ernie4_5_MoeForCausalLMRL(Ernie4_5_MoeForCausalLM):
         # Initialize mapping dictionary
         infer_to_train = {}
 
+        infer_base_name = "model"
+        train_base_name = "ernie"
         # Static mappings (non-layer specific)
         static_mappings = {
-            "model.embeddings.word_embeddings.weight":
-            "ernie.embed_tokens.weight",
-            "model.norm.ln_weight": "ernie.norm.weight",
+            f"{infer_base_name}.embeddings.word_embeddings.weight":
+            f"{train_base_name}.embed_tokens.weight",
+            f"{infer_base_name}.norm.ln_weight": f"{train_base_name}.norm.weight",
             "lm_head.out_linear.weight": "lm_head.weight"
         }
         if self.fd_config.model_config.get("weight_sharing", False):
@@ -135,53 +130,55 @@ class Ernie4_5_MoeForCausalLMRL(Ernie4_5_MoeForCausalLM):
             logger.debug("enable tie_word_embeddings")
            static_mappings.pop("lm_head.out_linear.weight")
         infer_to_train.update(static_mappings)
-        infer_base_name = "model.hidden_layers"
+
+        infer_base_name = infer_base_name + ".hidden_layers"
+        train_base_name = train_base_name + ".layers"
 
         # Helper function to add layer mappings
         def _add_layer_mappings(layer_idx, is_moe_layer=False):
             # Handle special case for layer 0's input layernorm
             for ph in place_holders:
                 infer_key = f"{infer_base_name}.{layer_idx}.input_layernorm.ln_{ph}"
-                train_key = f"ernie.layers.{layer_idx}.input_layernorm.{ph}"
+                train_key = f"{train_base_name}.{layer_idx}.input_layernorm.{ph}"
                 infer_to_train[infer_key] = train_key
 
             # Common attention mappings
             for ph in place_holders:
                 infer_to_train[f"{infer_base_name}.{layer_idx}.self_attn.qkv_proj.linear_{ph}"] = \
-                    f"ernie.layers.{layer_idx}.self_attn.qkv_proj.{ph}"
+                    f"{train_base_name}.{layer_idx}.self_attn.qkv_proj.{ph}"
                 infer_to_train[f"{infer_base_name}.{layer_idx}.self_attn.o_proj.linear_{ph}"] = \
-                    f"ernie.layers.{layer_idx}.self_attn.o_proj.{ph}"
+                    f"{train_base_name}.{layer_idx}.self_attn.o_proj.{ph}"
 
             # Post-attention layernorm
             for ph in place_holders:
                 infer_to_train[f"{infer_base_name}.{layer_idx}.post_attention_layernorm.ln_{ph}"] = \
-                    f"ernie.layers.{layer_idx}.post_attention_layernorm.{ph}"
+                    f"{train_base_name}.{layer_idx}.post_attention_layernorm.{ph}"
 
             if not is_moe_layer:
                 # Dense FFN mappings
                 for ph in place_holders:
                     infer_to_train[f"{infer_base_name}.{layer_idx}.mlp.gate_up_proj.linear_{ph}"] = \
-                        f"ernie.layers.{layer_idx}.mlp.up_gate_proj.{ph}"
+                        f"{train_base_name}.{layer_idx}.mlp.up_gate_proj.{ph}"
                     infer_to_train[f"{infer_base_name}.{layer_idx}.mlp.down_proj.linear_{ph}"] = \
-                        f"ernie.layers.{layer_idx}.mlp.down_proj.{ph}"
             else:
                 # MoE specific mappings
                 infer_to_train[f"{infer_base_name}.{layer_idx}.mlp.fused_moe.gate_weight"] = \
-                    f"ernie.layers.{layer_idx}.mlp.gate.weight"
+                    f"{train_base_name}.{layer_idx}.mlp.gate.weight"
 
                 if self.fd_config.moe_config.moe_use_aux_free:
                     infer_to_train[f"{infer_base_name}.{layer_idx}.mlp.fused_moe.gate_correction_bias"] = \
-                        f"ernie.layers.{layer_idx}.mlp.moe_statics.e_score_correction_bias"
+                        f"{train_base_name}.{layer_idx}.mlp.moe_statics.e_score_correction_bias"
 
                 # Support shared experts
                 if self.fd_config.model_config.get(
                         "moe_num_shared_experts") > 0:
                     infer_to_train[f"{infer_base_name}.{layer_idx}.mlp.shared_experts.gate_up_proj.linear_weight"] = \
-                        f"ernie.layers.{layer_idx}.mlp.shared_experts.up_gate_proj.weight"
+                        f"{train_base_name}.{layer_idx}.mlp.shared_experts.up_gate_proj.weight"
                     infer_to_train[f"{infer_base_name}.{layer_idx}.mlp.shared_experts.down_proj.linear_weight"] = \
-                        f"ernie.layers.{layer_idx}.mlp.shared_experts.down_proj.weight"
+                        f"{train_base_name}.{layer_idx}.mlp.shared_experts.down_proj.weight"
 
                 # MoE experts mappings
                 for expert_idx in range(self.fd_config.moe_config.num_experts):
@@ -191,7 +188,7 @@ class Ernie4_5_MoeForCausalLMRL(Ernie4_5_MoeForCausalLM):
                         if ffn1_key not in infer_to_train:
                             infer_to_train[ffn1_key] = []
                         infer_to_train[ffn1_key].append(
-                            f"ernie.layers.{layer_idx}.mlp.experts.{expert_idx}.up_gate_proj.{ph}"
+                            f"{train_base_name}.{layer_idx}.mlp.experts.{expert_idx}.up_gate_proj.{ph}"
                         )
 
                         # FFN2 (down_proj)
@@ -199,7 +196,7 @@ class Ernie4_5_MoeForCausalLMRL(Ernie4_5_MoeForCausalLM):
                         if ffn2_key not in infer_to_train:
                             infer_to_train[ffn2_key] = []
                         infer_to_train[ffn2_key].append(
-                            f"ernie.layers.{layer_idx}.mlp.experts.{expert_idx}.down_proj.{ph}"
+                            f"{train_base_name}.{layer_idx}.mlp.experts.{expert_idx}.down_proj.{ph}"
                         )
 
         # Process non-MoE layers
@@ -213,3 +210,118 @@ class Ernie4_5_MoeForCausalLMRL(Ernie4_5_MoeForCausalLM):
             _add_layer_mappings(layer_idx, is_moe_layer=True)
 
         return infer_to_train
+
+
+class Qwen2ForCausalLMRL(Qwen2ForCausalLM):
+    """
+    Qwen2ForCausalLMRL
+    """
+
+    def __init__(self, fd_config: FDConfig):
+        """
+        Args:
+            fd_config (FDConfig): Configurations for the LLM model.
+        """
+        super(Qwen2ForCausalLMRL, self).__init__(fd_config)
+
+    @classmethod
+    def name(self):
+        """name"""
+        return "Qwen2ForCausalLMRL"
+
+    def get_name_mappings_to_training(self):
+        """Generate mapping between inference and training parameter for RL(donot delete!)."""
+        # Prepare placeholders
+        place_holders = ["weight"]
+
+        # Initialize mapping dictionary
+        infer_to_train = {}
+
+        infer_base_name = "model"
+        train_base_name = "qwen2"
+        # Static mappings (non-layer specific)
+        static_mappings = {
+            f"{infer_base_name}.embeddings.word_embeddings.weight":
+            f"{train_base_name}.embed_tokens.weight",
+            f"{infer_base_name}.norm.ln_weight": f"{train_base_name}.norm.weight",
+            "lm_head.out_linear.weight": "lm_head.weight"
+        }
+        infer_to_train.update(static_mappings)
+
+        infer_base_name = infer_base_name + ".layers"
+        train_base_name = train_base_name + ".layers"
+
+        # Helper function to add layer mappings
+        def _add_layer_mappings(layer_idx):
+            # Handle special case for layer 0's input layernorm and attn o_proj
+            for ph in place_holders:
+                infer_key = f"{infer_base_name}.{layer_idx}.input_layernorm.ln_{ph}"
+                train_key = f"{train_base_name}.{layer_idx}.input_layernorm.{ph}"
+                infer_to_train[infer_key] = train_key
+
+                infer_to_train[f"{infer_base_name}.{layer_idx}.self_attn.o_proj.linear_{ph}"] = \
+                    f"{train_base_name}.{layer_idx}.self_attn.o_proj.{ph}"
+
+            # qwen qkv proj need bias
+            for ph in ["weight", "bias"]:
+                infer_to_train[f"{infer_base_name}.{layer_idx}.self_attn.qkv_proj.linear_{ph}"] = \
+                    f"{train_base_name}.{layer_idx}.self_attn.qkv_proj.{ph}"
+
+            # Post-attention layernorm
+            for ph in place_holders:
+                infer_to_train[f"{infer_base_name}.{layer_idx}.post_attention_layernorm.ln_{ph}"] = \
+                    f"{train_base_name}.{layer_idx}.post_attention_layernorm.{ph}"
+
+            # FFN mappings
+            for ph in place_holders:
+                infer_to_train[f"{infer_base_name}.{layer_idx}.mlp.gate_up_proj.linear_{ph}"] = \
+                    f"{train_base_name}.{layer_idx}.mlp.gate_up_fused_proj.{ph}"
+
+                infer_to_train[f"{infer_base_name}.{layer_idx}.mlp.down_proj.linear_{ph}"] = \
+                    f"{train_base_name}.{layer_idx}.mlp.down_proj.{ph}"
+
+        for layer_idx in range(
+                self.fd_config.model_config.num_layers):
+            _add_layer_mappings(layer_idx)
+
+        return infer_to_train
+
+
+class Qwen3MoeForCausalLMRL(Qwen3MoeForCausalLM):
+    """
+    Qwen3MoeForCausalLMRL
+    """
+
+    def __init__(self, fd_config: FDConfig):
+        """
+        Args:
+            fd_config (FDConfig): Configurations for the LLM model.
+        """
+        super(Qwen3MoeForCausalLMRL, self).__init__(fd_config)
+
+    @classmethod
+    def name(self):
+        """name"""
+        return "Qwen3MoeForCausalLMRL"
+
+    def get_name_mappings_to_training(self):
+        """Generate mapping between inference and training parameter for RL(donot delete!)."""
+        pass
+
+
+class Qwen3ForCausalLMRL(Qwen3ForCausalLM):
+    """
+    Qwen3ForCausalLMRL
+    """
+
+    def __init__(self, fd_config: FDConfig):
+        """
+        Args:
+            fd_config (FDConfig): Configurations for the LLM model.
+        """
+        super(Qwen3ForCausalLMRL, self).__init__(fd_config)
+
+    @classmethod
+    def name(self):
+        """name"""
+        return "Qwen3ForCausalLMRL"
\ No newline at end of file
diff --git a/fastdeploy/worker/worker_process.py b/fastdeploy/worker/worker_process.py
index 2b2642c2a..a94b2b4de 100644
--- a/fastdeploy/worker/worker_process.py
+++ b/fastdeploy/worker/worker_process.py
@@ -511,7 +511,7 @@ def parse_args():
 
     parser.add_argument("--quantization",
                         type=str,
-                        default="",
+                        default="None",
                         help="Quantization name for the model, currentlly support " \
                             "'wint4', 'wint8'," \
                             "default is None. The priority of this configuration "\
@@ -555,148 +555,169 @@ def parse_args():
     return args
 
 
-def initialize_fd_config(args: argparse.Namespace) -> FDConfig:
-    """Initialize FDConfig
-    TODO(gongshaotian): Unified all configs to FDConfig
+def initialize_fd_config(config) -> FDConfig:
+    """Initialize FDConfig from either RolloutModelConfig or argparse.Namespace
+
+    Args:
+        config: Configuration object containing all parameters (either RolloutModelConfig or argparse.Namespace)
+
+    Returns:
+        FDConfig: Initialized FastDeploy configuration object
     """
-    # NOTE(gongshaotian): From build stream line model
-    config, _ = ModelConfig.get_config_dict(args.model_name_or_path)
-    if 'num_experts' in config:
-        config['moe_num_experts'] = config.pop('num_experts')
+    # Get model config from model directory
+    model_config_dict, _ = ModelConfig.get_config_dict(config.model_name_or_path)
 
-    if 'num_experts_per_tok' in config:
-        config['moe_topk'] = config.pop('num_experts_per_tok')
-    config["head_dim"] = config.get(
-        "head_dim", config["hidden_size"] // config["num_attention_heads"])
-    config["rope_theta"] = config.get("rope_theta", 10000.0)
-    model_config = ModelConfig.from_dict(config)
-    # TODO Set `head_dim` again. Because `ModelConfig` class doesn't support feeding head_dim at all!
-    model_config.head_dim = config["head_dim"]
-    paddle.set_default_dtype(args.dtype)
+    # Handle MoE related configs
+    if 'num_experts' in model_config_dict:
+        model_config_dict['moe_num_experts'] = model_config_dict.pop('num_experts')
+    if 'num_experts_per_tok' in model_config_dict:
+        model_config_dict['moe_topk'] = model_config_dict.pop('num_experts_per_tok')
+
+    # Set default values for model config
+    model_config_dict["head_dim"] = model_config_dict.get(
+        "head_dim", model_config_dict["hidden_size"] // model_config_dict["num_attention_heads"])
+    model_config_dict["rope_theta"] = model_config_dict.get("rope_theta", 10000.0)
+
+    # Create model config object
+    model_config = ModelConfig.from_dict(model_config_dict)
+    model_config.head_dim = model_config_dict["head_dim"]
+    paddle.set_default_dtype(config.dtype)
+
+    # Initialize all config components
     device_config = DeviceConfig()
-    # model_config = ModelConfig()
-
     decoding_config = DecodingConfig()
-
     speculative_config = SpeculativeConfig()
     parallel_config = ParallelConfig()
     load_config = LoadConfig()
     moe_config = MoEConfig()
-    graph_opt_config = GraphOptimizationConfig(
-        args.enable_static_graph_inference, args.use_cudagraph,
-        args.max_capture_batch_size)
-    model_config.quantization = args.quantization
-    # Update speculate config
-    speculative_config.method = args.speculative_method
-    speculative_config.num_speculative_tokens = args.speculative_max_draft_token_num
-    speculative_config.model_name_or_path = args.speculative_model_name_or_path
-    speculative_config.quantization = args.speculative_model_quantization
+
+    # Handle graph optimization config (check for attribute existence for backward compatibility)
+    enable_static_graph_inference = getattr(config, 'enable_static_graph_inference', False)
+    use_cudagraph = getattr(config, 'use_cudagraph', False)
+    max_capture_batch_size = getattr(config, 'max_capture_batch_size', 0)
+
+    graph_opt_config = GraphOptimizationConfig(
+        enable_static_graph_inference,
+        use_cudagraph,
+        max_capture_batch_size
+    )
+
+    # Handle quantization (check for attribute existence)
+    model_config.quantization = getattr(config, 'quantization', None)
+
+    # Update speculative config
+    speculative_config.method = getattr(config, 'speculative_method', None)
+    speculative_config.num_speculative_tokens = getattr(config, 'speculative_max_draft_token_num', 0)
+    speculative_config.model_name_or_path = getattr(config, 'speculative_model_name_or_path', None)
+    speculative_config.quantization = getattr(config, 'speculative_model_quantization', None)
 
     # Update parallel config
-    parallel_config.engine_pid = args.engine_pid
-    parallel_config.model_name_or_path = args.model_name_or_path
-    parallel_config.max_num_seqs = args.max_num_seqs
-    parallel_config.max_block_num = args.total_block_num
-    parallel_config.block_size = args.block_size
-    parallel_config.engine_worker_queue_port = args.engine_worker_queue_port
-    parallel_config.max_model_len = args.max_model_len
-    model_config.max_seq_len = args.max_model_len
-    model_config.max_length = args.max_model_len
-    parallel_config.device_ids = args.device_ids
-    parallel_config.dtype = args.dtype
-    parallel_config.enc_dec_block_num = args.enc_dec_block_num
-    parallel_config.kv_cache_ratio = args.kv_cache_ratio
-    parallel_config.first_token_id = args.first_token_id
-    parallel_config.gpu_memory_utilization = args.gpu_memory_utilization
-    parallel_config.engine_pid = args.engine_pid
-    parallel_config.do_profile = args.do_profile
-    parallel_config.dynamic_load_weight = args.dynamic_load_weight
-    parallel_config.pad_token_id = args.pad_token_id
-    parallel_config.eos_tokens_lens = args.eos_tokens_lens
-    parallel_config.enable_chunked_prefill = args.enable_chunked_prefill
-    parallel_config.max_num_batched_tokens = args.max_num_batched_tokens
-    parallel_config.enable_prefix_caching = args.enable_prefix_caching
+    parallel_config.engine_pid = getattr(config, 'engine_pid', None)
+    parallel_config.model_name_or_path = config.model_name_or_path
+    parallel_config.max_num_seqs = getattr(config, 'max_num_seqs', 0)
+    parallel_config.max_block_num = getattr(config, 'total_block_num', 0)
+    parallel_config.block_size = getattr(config, 'block_size', 0)
+    parallel_config.engine_worker_queue_port = getattr(config, 'engine_worker_queue_port', 0)
+    parallel_config.max_model_len = getattr(config, 'max_model_len', 0)
+    model_config.max_seq_len = getattr(config, 'max_model_len', 0)
+    model_config.max_length = getattr(config, 'max_model_len', 0)
+    parallel_config.device_ids = getattr(config, 'device_ids', [])
+    parallel_config.dtype = config.dtype
+    parallel_config.enc_dec_block_num = getattr(config, 'enc_dec_block_num', 0)
+    parallel_config.kv_cache_ratio = getattr(config, 'kv_cache_ratio', 1.0)
+    parallel_config.first_token_id = getattr(config, 'first_token_id', None)
+    parallel_config.gpu_memory_utilization = getattr(config, 'gpu_memory_utilization', 0.9)
+    parallel_config.engine_pid = getattr(config, 'engine_pid', None)
+    parallel_config.do_profile = getattr(config, 'do_profile', False)
+    parallel_config.dynamic_load_weight = getattr(config, 'dynamic_load_weight', False)
+    parallel_config.pad_token_id = getattr(config, 'pad_token_id', None)
+    parallel_config.eos_tokens_lens = getattr(config, 'eos_tokens_lens', 0)
+    parallel_config.enable_chunked_prefill = getattr(config, 'enable_chunked_prefill', False)
+    parallel_config.max_num_batched_tokens = getattr(config, 'max_num_batched_tokens', 0)
+    parallel_config.enable_prefix_caching = getattr(config, 'enable_prefix_caching', False)
+    parallel_config.use_ep = getattr(config, 'enable_expert_parallell', False)
+    parallel_config.tensor_parallel_degree = getattr(config, 'tensor_parallel_size', 1)
+    parallel_config.expert_parallel_degree = getattr(config, 'expert_parallel_size', 1)
+    parallel_config.splitwise_role = getattr(config, 'splitwise_role', None)
+    parallel_config.guided_decoding_backend = getattr(config, 'guided_decoding_backend', None)
+    parallel_config.disable_any_whitespace = getattr(config, 'disable_any_whitespace', False)
 
-    parallel_config.use_ep = args.enable_expert_parallell
-    parallel_config.tensor_parallel_degree = args.tensor_parallel_size
-    parallel_config.expert_parallel_degree = args.expert_parallel_size
-    parallel_config.splitwise_role = args.splitwise_role
+    # Handle load config (check for environment variable)
     load_config.use_fastsafetensor = int(envs.FD_USE_FASTSAFETENSOR) == 1
-    parallel_config.guided_decoding_backend = args.guided_decoding_backend
-    parallel_config.disable_any_whitespace = args.disable_any_whitespace
-
+    # Log parallel config info
     logger.info(f"parallel_config.use_ep {parallel_config.use_ep}")
-    logger.info(
-        f"parallel_config.tensor_parallel_degree {parallel_config.tensor_parallel_degree}"
-    )
-    logger.info(f"args.splitwise_role {args.splitwise_role}")
+    logger.info(f"parallel_config.tensor_parallel_degree {parallel_config.tensor_parallel_degree}")
+    logger.info(f"splitwise_role {parallel_config.splitwise_role}")
 
-    if args.splitwise_role == "mixed":
+    # Set MoE phase based on splitwise role
+    if parallel_config.splitwise_role == "mixed":
         parallel_config.moe_phase = MoEPhase.PREFILL
-    elif args.splitwise_role == "prefill":
+    elif parallel_config.splitwise_role == "prefill":
         parallel_config.moe_phase = MoEPhase.PREFILL
-    elif args.splitwise_role == "decode":
+    elif parallel_config.splitwise_role == "decode":
         parallel_config.moe_phase = MoEPhase.DECODER
-    else:
+    elif parallel_config.splitwise_role is not None:
         raise NotImplementedError
 
-    num_key_value_heads = config.get("num_key_value_heads", -1)
+    # Handle model architecture specific configurations
+    num_key_value_heads = model_config_dict.get("num_key_value_heads", -1)
     if num_key_value_heads is None:
         num_key_value_heads = -1
 
-    if config.get("ffn_hidden_size", None) is not None:
-        ffn_hidden_size = config["ffn_hidden_size"]
-    elif config.get("intermediate_size", None) is not None:
-        ffn_hidden_size = config["intermediate_size"]
+    # Calculate FFN hidden size
+    if model_config_dict.get("ffn_hidden_size", None) is not None:
+        ffn_hidden_size = model_config_dict["ffn_hidden_size"]
+    elif model_config_dict.get("intermediate_size", None) is not None:
+        ffn_hidden_size = model_config_dict["intermediate_size"]
     else:
-        ffn_hidden_size = 4 * config["hidden_size"]
-        if config["hidden_act"].lower() == "swiglu":
+        ffn_hidden_size = 4 * model_config_dict["hidden_size"]
+        if model_config_dict["hidden_act"].lower() == "swiglu":
             if paddle.distributed.get_world_size() > 1:
-                multiple_of = 8 * config["num_attention_heads"]
+                multiple_of = 8 * model_config_dict["num_attention_heads"]
             else:
-                multiple_of = 4 * config["num_attention_heads"]
+                multiple_of = 4 * model_config_dict["num_attention_heads"]
             ffn_hidden_size = multiple_of * (
                 (int(2 * ffn_hidden_size / 3) + multiple_of - 1) // multiple_of)
 
-    num_layers = config.get("num_layers", None) or config.get(
+    # Get number of layers
+    num_layers = model_config_dict.get("num_layers", None) or model_config_dict.get(
         "num_hidden_layers", None)
     if num_layers is None:
         raise ValueError(f"num_layers<{num_layers}> is invalid")
 
-    use_moe = config.get("moe_layer_start_index", num_layers) < num_layers
+    use_moe = model_config_dict.get("moe_layer_start_index", num_layers) < num_layers
+
+    # Update model config
     model_config.ffn_hidden_size = ffn_hidden_size
     model_config.num_layers = num_layers
-    model_config.num_key_value_heads = num_key_value_heads
-    model_config.start_layer_index = config.get("start_layer_index", 0)
-    moe_config.num_experts = config.get("moe_num_experts", None)
-    moe_config.moe_intermediate_size = config.get("moe_intermediate_size",
-                                                  None)
-    moe_config.top_k = config.get("moe_k", config.get("moe_topk", 8))
-    moe_config.moe_num_shared_experts = config.get("moe_num_shared_experts", 0)
-    moe_config.moe_layer_start_index = config.get("moe_layer_start_index", 0)
+    model_config.start_layer_index = model_config_dict.get("start_layer_index", 0)
 
-    moe_config.num_max_dispatch_tokens_per_rank = config.get(
+    # Update MoE config
+    moe_config.num_experts = model_config_dict.get("moe_num_experts", None)
+    moe_config.moe_intermediate_size = model_config_dict.get("moe_intermediate_size", None)
+    moe_config.top_k = model_config_dict.get("moe_k", model_config_dict.get("moe_topk", 8))
+    moe_config.moe_num_shared_experts = model_config_dict.get("moe_num_shared_experts", 0)
+    moe_config.moe_layer_start_index = model_config_dict.get("moe_layer_start_index", 0)
+    moe_config.num_max_dispatch_tokens_per_rank = model_config_dict.get(
         "num_max_dispatch_tokens_per_rank", 256)
-    moe_config.moe_use_aux_free = config.get("moe_use_aux_free", False)
+    moe_config.moe_use_aux_free = model_config_dict.get("moe_use_aux_free", False)
 
-    model_config.ori_vocab_size = config.get("vocab_size", -1)
-    if "Ernie4_5_ForCausalLM" in config.get("architectures"):
-        model_config.ori_vocab_size = args.ori_vocab_size
+    # Handle vocabulary size
+    model_config.ori_vocab_size = model_config_dict.get("vocab_size", -1)
+    if "Ernie4_5_ForCausalLM" in model_config_dict.get("architectures", []):
+        model_config.ori_vocab_size = getattr(config, 'ori_vocab_size', model_config.ori_vocab_size)
 
-    if "DeepseekV3ForCausalLM" in config.get("architectures"):
+    # Handle DeepseekV3 specific config
+    if "DeepseekV3ForCausalLM" in model_config_dict.get("architectures", []):
         from paddleformers.transformers import AutoConfig
         model_config.deepseekv3 = AutoConfig.from_pretrained(
-            args.model_name_or_path)
+            config.model_name_or_path)
 
-    #TODO(@yuanrisheng): kv_cache quant config can only be
-    # stored in model config file, which should be unified
-    quantization_config = config.get("quantization_config", None)
+    # Handle quantization config
+    quantization_config = model_config_dict.get("quantization_config", None)
+
     if not model_config.is_quantized:
         if quantization_config is not None:
             if "kv_cache_quant_type" not in quantization_config:
@@ -711,13 +732,13 @@ def initialize_fd_config(args: argparse.Namespace) -> FDConfig:
 
     if quantization_config is not None:
         quant_config_name = quantization_config["quantization"]
-    elif args.quantization != "None":
+    elif getattr(config, 'quantization', None) != "None":
         quantization_config = {}
-        quant_config_name = args.quantization
+        quant_config_name = getattr(config, 'quantization', None)
         quantization_config["quantization"] = quant_config_name
-        # use some trick code for ernie model and will unify it in future.
-        is_ernie = "Ernie4_5_ForCausalLM" in config.get("architectures") or \
-            "Ernie4_5_MoeForCausalLM" in config.get("architectures")
+        # Special handling for Ernie models
+        is_ernie = "Ernie4_5_ForCausalLM" in model_config_dict.get("architectures", []) or \
+            "Ernie4_5_MoeForCausalLM" in model_config_dict.get("architectures", [])
         if use_moe and quant_config_name == "wint4" and is_ernie:
             quantization_config["dense_quant_type"] = "wint8"
             quantization_config["moe_quant_type"] = "wint4"
@@ -732,6 +753,7 @@ def initialize_fd_config(args: argparse.Namespace) -> FDConfig:
         quant_cls = get_quantization_config(quant_config_name)
         quant_config = quant_cls.from_config(quantization_config)
 
+    # Log quantization info
     logger.info("===========quantization_config==============")
     if quant_config is not None:
         if model_config.is_quantized:
@@ -742,29 +764,33 @@ def initialize_fd_config(args: argparse.Namespace) -> FDConfig:
 
            logger.info(
                 "Model Status: Original (will apply online quantization)")
-            logger.info(f"Quantization Method: {args.quantization or 'None'}")
+            logger.info(f"Quantization Method: {getattr(config, 'quantization', 'None')}")
     else:
         logger.info(
             "No quantization config found and use original weight and act dtype."
         )
 
-    model_config.architectures = config.get("architectures")
+    model_config.architectures = model_config_dict.get("architectures")
 
+    # Update load config
     logger.info("===========load_config==============")
-    load_config.dynamic_load_weight = args.dynamic_load_weight
-    load_config.load_strategy = args.load_strategy
+    load_config.dynamic_load_weight = getattr(config, 'dynamic_load_weight', False)
+    load_config.load_strategy = getattr(config, 'load_strategy', None)
     logger.info(f"- Dynamic load weight: {load_config.dynamic_load_weight}")
     logger.info(f"- Load strategy: {load_config.load_strategy}")
 
-    fd_config = FDConfig(model_config=model_config,
-                         parallel_config=parallel_config,
-                         speculative_config=speculative_config,
-                         device_config=device_config,
-                         load_config=load_config,
-                         moe_config=moe_config,
-                         decoding_config=decoding_config,
-                         quant_config=quant_config,
-                         graph_opt_config=graph_opt_config)
+    # Create and return FDConfig
+    fd_config = FDConfig(
+        model_config=model_config,
+        parallel_config=parallel_config,
+        speculative_config=speculative_config,
+        device_config=device_config,
+        load_config=load_config,
+        moe_config=moe_config,
+        decoding_config=decoding_config,
+        quant_config=quant_config,
+        graph_opt_config=graph_opt_config
+    )
 
     return fd_config
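Usage note: the snippet below is a minimal sketch of how the rollout entry points introduced in this patch (`RolloutModelConfig`, `RolloutModel`, and the per-model `get_name_mappings_to_training()` hooks) are expected to fit together; it is not code from the patch itself. The checkpoint path and the keyword values shown are placeholders, and only a few of the config's arguments are set explicitly (the rest fall back to the defaults defined in `RolloutModelConfig`).

```python
# Minimal usage sketch (assumptions: placeholder checkpoint path, default values
# for all RolloutModelConfig arguments that are not passed explicitly).
from fastdeploy.rl.rollout_config import RolloutModelConfig
from fastdeploy.rl.rollout_model import RolloutModel

# Build the lightweight rollout config; RolloutModel calls config.initialize()
# internally, which routes through initialize_fd_config() to build the FDConfig.
rollout_config = RolloutModelConfig(
    model_name_or_path="/path/to/checkpoint",  # placeholder path
    tensor_parallel_size=4,
    dynamic_load_weight=True,
    load_strategy="meta",
)
print(rollout_config)  # __str__ dumps one "key: value" line per field

# Instantiating RolloutModel builds the inference model(s); the "*RL" variants
# (e.g. "Qwen2ForCausalLMRL") are registered when fastdeploy.rl is imported.
rollout_model = RolloutModel(rollout_config)

# Collect inference->training parameter name mappings where a model provides
# them (Qwen3MoeForCausalLMRL currently returns None, hence the `or {}` guard).
infer_to_train = {}
for sub_model in rollout_model.rollout_models:
    mapping_fn = getattr(sub_model, "get_name_mappings_to_training", None)
    if mapping_fn is not None:
        infer_to_train.update(mapping_fn() or {})
```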