diff --git a/fastdeploy/model_executor/models/glm4_moe.py b/fastdeploy/model_executor/models/glm4_moe.py
index 01b38bedd..40a492126 100644
--- a/fastdeploy/model_executor/models/glm4_moe.py
+++ b/fastdeploy/model_executor/models/glm4_moe.py
@@ -17,9 +17,12 @@
 from __future__ import annotations
 
 import re
+from functools import partial
 
 import paddle
 from paddle import nn
+from paddleformers.transformers import PretrainedModel
+from paddleformers.utils.log import logger
 
 from fastdeploy.config import FDConfig
 from fastdeploy.distributed.communication import tensor_model_parallel_all_reduce
@@ -504,3 +507,86 @@ class Glm4MoeForCausalLM(ModelForCasualLM):
     def clear_grpah_opt_backend(self):
         """Clear graph optimization backend, the captured cuda graph will be cleaned"""
         self.model.clear_grpah_opt_backend(fd_config=self.fd_config)
+
+
+class Glm4MoePretrainedModel(PretrainedModel):
+    """
+    Glm4MoePretrainedModel
+    """
+
+    config_class = FDConfig
+
+    def _init_weight(self, layer):
+        """
+        _init_weight
+        """
+        return None
+
+    @classmethod
+    def arch_name(cls):
+        return "Glm4MoeForCausalLM"
+
+    @classmethod
+    def _get_tensor_parallel_mappings(cls, config, is_split=True):
+
+        logger.info("Glm4Moe inference model _get_tensor_parallel_mappings")
+
+        from fastdeploy.model_executor.models.tp_utils import split_or_merge_func_v1
+
+        fn = split_or_merge_func_v1(
+            is_split=is_split,
+            tensor_parallel_degree=config.tensor_parallel_degree,
+            tensor_parallel_rank=config.tensor_parallel_rank,
+            num_attention_heads=config.num_attention_heads,
+            num_key_value_heads=config.num_key_value_heads,
+            head_dim=config.head_dim,
+        )
+
+        def get_tensor_parallel_split_mappings(num_layers):
+            final_actions = {}
+
+            base_actions = {
+                "lm_head.weight": partial(fn, is_column=True),
+                "embed_tokens.weight": partial(fn, is_column=False),
+                "layers.0.self_attn.o_proj.weight": partial(fn, is_column=False),
+            }
+
+            # Self-attention projections that need tensor parallelism.
+            base_actions["layers.0.self_attn.q_proj.weight"] = partial(fn, is_column=True)
+            base_actions["layers.0.self_attn.k_proj.weight"] = partial(fn, is_column=True)
+            base_actions["layers.0.self_attn.v_proj.weight"] = partial(fn, is_column=True)
+            base_actions["layers.0.self_attn.q_proj.bias"] = partial(fn, is_column=True)
+            base_actions["layers.0.self_attn.k_proj.bias"] = partial(fn, is_column=True)
+            base_actions["layers.0.self_attn.v_proj.bias"] = partial(fn, is_column=True)
+
+            # MLP Layer
+            base_actions["layers.0.mlp.gate_proj.weight"] = partial(fn, is_column=True)
+            base_actions["layers.0.mlp.up_proj.weight"] = partial(fn, is_column=True)
+            base_actions["layers.0.mlp.down_proj.weight"] = partial(fn, is_column=False)
+
+            # Moe Layer
+            for expert_idx in range(config.n_routed_experts):
+                base_actions[f"layers.0.mlp.experts.{expert_idx}.up_proj.weight"] = partial(fn, is_column=True)
+                base_actions[f"layers.0.mlp.experts.{expert_idx}.gate_proj.weight"] = partial(fn, is_column=True)
+                base_actions[f"layers.0.mlp.experts.{expert_idx}.down_proj.weight"] = partial(fn, is_column=False)
+
+            # Shared Expert Layer
+            base_actions["layers.0.mlp.shared_experts.up_proj.weight"] = partial(fn, is_column=True)
+            base_actions["layers.0.mlp.shared_experts.gate_proj.weight"] = partial(fn, is_column=True)
+            base_actions["layers.0.mlp.shared_experts.down_proj.weight"] = partial(fn, is_column=False)
+
+            # MTP parts
+            base_actions["layers.46.embed_tokens.weight"] = partial(fn, is_column=False)
+            base_actions["layers.46.eh_proj.weight"] = partial(fn, is_column=True)
+            base_actions["layers.46.shared_head.head.weight"] = partial(fn, is_column=True)
+
+            for key, action in base_actions.items():
+                if "layers.0." in key:
+                    for i in range(num_layers):
+                        final_actions[key.replace("layers.0.", f"layers.{i}.")] = action
+                final_actions[key] = action
+
+            return final_actions
+
+        mappings = get_tensor_parallel_split_mappings(config.num_hidden_layers)
+        return mappings
diff --git a/fastdeploy/rl/rollout_model.py b/fastdeploy/rl/rollout_model.py
index 3282e4548..4e52f29e8 100644
--- a/fastdeploy/rl/rollout_model.py
+++ b/fastdeploy/rl/rollout_model.py
@@ -14,6 +14,7 @@
 # limitations under the License.
 """
 
+import copy
 from typing import Dict
 
 import paddle
@@ -28,6 +29,10 @@ from fastdeploy.model_executor.models.ernie4_5_vl.ernie4_5_vl_moe import (
     Ernie4_5_VLMoeForConditionalGeneration,
     Ernie4_5_VLPretrainedModel,
 )
+from fastdeploy.model_executor.models.glm4_moe import (
+    Glm4MoeForCausalLM,
+    Glm4MoePretrainedModel,
+)
 from fastdeploy.model_executor.models.model_base import ModelRegistry
 from fastdeploy.model_executor.models.qwen2 import (
     Qwen2ForCausalLM,
@@ -529,3 +534,83 @@ class Qwen2_5_VLForConditionalGenerationRL(Qwen2_5_VLForConditionalGeneration, B
         self._complete_missing_mappings()
 
         return self.infer_to_train_mapping
+
+
+class Glm4MoeForCausalLMRL(Glm4MoeForCausalLM, BaseRLModel):
+    """
+    Glm4MoeForCausalLMRL
+    """
+
+    _get_tensor_parallel_mappings = Glm4MoePretrainedModel._get_tensor_parallel_mappings
+
+    def __init__(self, fd_config: FDConfig):
+        """
+        Args:
+            fd_config (FDConfig): Configurations for the LLM model.
+ """ + super(Glm4MoeForCausalLMRL, self).__init__(fd_config) + + @classmethod + def name(self) -> str: + """name""" + return "Glm4MoeForCausalLMRL" + + def get_name_mappings_to_training(self, trainer_degree=None) -> Dict[str, str]: + """Generate mapping between inference and training parameter for RL(donot delete!).""" + if self._mappings_built: + return self.infer_to_train_mapping + + self.infer_to_train_mapping = {} + self._mappings_built = True + # Prepare placeholders + place_holders = ["weight"] + + # Initialize mapping dictionary + self._update_base_mappings("model") + + base_name = "model.layers" + + # Helper function to add layer mappings + def _add_layer_mappings(layer_idx: int): + # MoE specific mappings + self.infer_to_train_mapping[f"{base_name}.{layer_idx}.mlp.gate.weight"] = ( + f"{base_name}.{layer_idx}.mlp.gate.weight" + ) + + self.infer_to_train_mapping[f"{base_name}.{layer_idx}.mlp.gate.e_score_correction_bias"] = ( + f"{base_name}.{layer_idx}.mlp.gate.e_score_correction_bias" + ) + + # MoE experts mappings + for expert_idx in range(self.fd_config.model_config.n_routed_experts): + for ph in place_holders: + # up_gate_proj (up_gate_proj) + up_gate_proj_key = f"{base_name}.{layer_idx}.mlp.experts.up_gate_proj_weight" + if up_gate_proj_key not in self.infer_to_train_mapping: + self.infer_to_train_mapping[up_gate_proj_key] = [] + self.infer_to_train_mapping[up_gate_proj_key].append( + f"{base_name}.{layer_idx}.mlp.experts.{expert_idx}.up_gate_proj.{ph}" + ) + + # down_proj (down_proj) + down_proj_key = f"{base_name}.{layer_idx}.mlp.experts.down_proj_weight" + if down_proj_key not in self.infer_to_train_mapping: + self.infer_to_train_mapping[down_proj_key] = [] + self.infer_to_train_mapping[down_proj_key].append( + f"{base_name}.{layer_idx}.mlp.experts.{expert_idx}.down_proj.{ph}" + ) + + # Process MoE layers + for layer_idx in range( + self.fd_config.model_config.first_k_dense_replace, + self.fd_config.model_config.num_hidden_layers, + ): + _add_layer_mappings(layer_idx) + + self._complete_missing_mappings() + infer_to_train_mapping_copy = copy.deepcopy(self.infer_to_train_mapping) + for key in infer_to_train_mapping_copy.keys(): + if "mlp.experts.gate_correction_bias" in key: + self.infer_to_train_mapping.pop(key) + + return self.infer_to_train_mapping diff --git a/tests/ci_use/EB_VL_Lite/test_rollout_model.py b/tests/ci_use/EB_VL_Lite/test_rollout_model.py index 9fbfc4821..e9bcd93cb 100644 --- a/tests/ci_use/EB_VL_Lite/test_rollout_model.py +++ b/tests/ci_use/EB_VL_Lite/test_rollout_model.py @@ -22,8 +22,9 @@ def test_rollout_model_with_distributed_launch(): test_rollout_model """ current_dir = os.path.dirname(os.path.abspath(__file__)) - - rollout_script = os.path.join(current_dir, "rollout_model.py") + utils_dir = os.path.join(os.path.dirname(current_dir), "utils") + rollout_script = os.path.join(utils_dir, "rollout_model.py") + baseline_path = os.path.join(current_dir, "baseline.txt") base_path = os.getenv("MODEL_PATH") if base_path: @@ -40,6 +41,11 @@ def test_rollout_model_with_distributed_launch(): rollout_script, "--model_path", model_path, + "--baseline_path", + baseline_path, + "--enable_mm", + "--quantization", + "wint8", ] print(f"Executing command: {' '.join(command)}") diff --git a/tests/ci_use/GLM-45-AIR/baseline.txt b/tests/ci_use/GLM-45-AIR/baseline.txt new file mode 100644 index 000000000..7fd21998b --- /dev/null +++ b/tests/ci_use/GLM-45-AIR/baseline.txt @@ -0,0 +1,43 @@ +model.embed_tokens.embeddings.weight +model.layers.0.self_attn.qkv_proj.bias 
+model.layers.0.self_attn.qkv_proj.weight
+model.layers.0.self_attn.o_proj.weight
+model.layers.0.mlp.up_gate_proj.weight
+model.layers.0.mlp.down_proj.weight
+model.layers.0.input_layernorm.weight
+model.layers.0.post_attention_layernorm.weight
+model.layers.1.self_attn.qkv_proj.bias
+model.layers.1.self_attn.qkv_proj.weight
+model.layers.1.self_attn.o_proj.weight
+model.layers.1.mlp.gate.weight
+model.layers.1.mlp.gate.e_score_correction_bias
+model.layers.1.mlp.experts.gate_correction_bias
+model.layers.1.mlp.experts.up_gate_proj_weight
+model.layers.1.mlp.experts.down_proj_weight
+model.layers.1.mlp.shared_experts.up_gate_proj.weight
+model.layers.1.mlp.shared_experts.down_proj.weight
+model.layers.1.input_layernorm.weight
+model.layers.1.post_attention_layernorm.weight
+model.norm.weight
+lm_head.linear.weight
+model.embed_tokens.embeddings.weight:model.embed_tokens.weight
+lm_head.linear.weight:lm_head.weight
+model.layers.1.mlp.gate.weight:model.layers.1.mlp.gate.weight
+model.layers.1.mlp.gate.e_score_correction_bias:model.layers.1.mlp.gate.e_score_correction_bias
+model.layers.1.mlp.experts.up_gate_proj_weight:['model.layers.1.mlp.experts.0.up_gate_proj.weight', 'model.layers.1.mlp.experts.1.up_gate_proj.weight', 'model.layers.1.mlp.experts.2.up_gate_proj.weight', 'model.layers.1.mlp.experts.3.up_gate_proj.weight', 'model.layers.1.mlp.experts.4.up_gate_proj.weight', 'model.layers.1.mlp.experts.5.up_gate_proj.weight', 'model.layers.1.mlp.experts.6.up_gate_proj.weight', 'model.layers.1.mlp.experts.7.up_gate_proj.weight', 'model.layers.1.mlp.experts.8.up_gate_proj.weight', 'model.layers.1.mlp.experts.9.up_gate_proj.weight', 'model.layers.1.mlp.experts.10.up_gate_proj.weight', 'model.layers.1.mlp.experts.11.up_gate_proj.weight', 'model.layers.1.mlp.experts.12.up_gate_proj.weight', 'model.layers.1.mlp.experts.13.up_gate_proj.weight', 'model.layers.1.mlp.experts.14.up_gate_proj.weight', 'model.layers.1.mlp.experts.15.up_gate_proj.weight', 'model.layers.1.mlp.experts.16.up_gate_proj.weight', 'model.layers.1.mlp.experts.17.up_gate_proj.weight', 'model.layers.1.mlp.experts.18.up_gate_proj.weight', 'model.layers.1.mlp.experts.19.up_gate_proj.weight', 'model.layers.1.mlp.experts.20.up_gate_proj.weight', 'model.layers.1.mlp.experts.21.up_gate_proj.weight', 'model.layers.1.mlp.experts.22.up_gate_proj.weight', 'model.layers.1.mlp.experts.23.up_gate_proj.weight', 'model.layers.1.mlp.experts.24.up_gate_proj.weight', 'model.layers.1.mlp.experts.25.up_gate_proj.weight', 'model.layers.1.mlp.experts.26.up_gate_proj.weight', 'model.layers.1.mlp.experts.27.up_gate_proj.weight', 'model.layers.1.mlp.experts.28.up_gate_proj.weight', 'model.layers.1.mlp.experts.29.up_gate_proj.weight', 'model.layers.1.mlp.experts.30.up_gate_proj.weight', 'model.layers.1.mlp.experts.31.up_gate_proj.weight', 'model.layers.1.mlp.experts.32.up_gate_proj.weight', 'model.layers.1.mlp.experts.33.up_gate_proj.weight', 'model.layers.1.mlp.experts.34.up_gate_proj.weight', 'model.layers.1.mlp.experts.35.up_gate_proj.weight', 'model.layers.1.mlp.experts.36.up_gate_proj.weight', 'model.layers.1.mlp.experts.37.up_gate_proj.weight', 'model.layers.1.mlp.experts.38.up_gate_proj.weight', 'model.layers.1.mlp.experts.39.up_gate_proj.weight', 'model.layers.1.mlp.experts.40.up_gate_proj.weight', 'model.layers.1.mlp.experts.41.up_gate_proj.weight', 'model.layers.1.mlp.experts.42.up_gate_proj.weight', 'model.layers.1.mlp.experts.43.up_gate_proj.weight', 'model.layers.1.mlp.experts.44.up_gate_proj.weight',
'model.layers.1.mlp.experts.45.up_gate_proj.weight', 'model.layers.1.mlp.experts.46.up_gate_proj.weight', 'model.layers.1.mlp.experts.47.up_gate_proj.weight', 'model.layers.1.mlp.experts.48.up_gate_proj.weight', 'model.layers.1.mlp.experts.49.up_gate_proj.weight', 'model.layers.1.mlp.experts.50.up_gate_proj.weight', 'model.layers.1.mlp.experts.51.up_gate_proj.weight', 'model.layers.1.mlp.experts.52.up_gate_proj.weight', 'model.layers.1.mlp.experts.53.up_gate_proj.weight', 'model.layers.1.mlp.experts.54.up_gate_proj.weight', 'model.layers.1.mlp.experts.55.up_gate_proj.weight', 'model.layers.1.mlp.experts.56.up_gate_proj.weight', 'model.layers.1.mlp.experts.57.up_gate_proj.weight', 'model.layers.1.mlp.experts.58.up_gate_proj.weight', 'model.layers.1.mlp.experts.59.up_gate_proj.weight', 'model.layers.1.mlp.experts.60.up_gate_proj.weight', 'model.layers.1.mlp.experts.61.up_gate_proj.weight', 'model.layers.1.mlp.experts.62.up_gate_proj.weight', 'model.layers.1.mlp.experts.63.up_gate_proj.weight', 'model.layers.1.mlp.experts.64.up_gate_proj.weight', 'model.layers.1.mlp.experts.65.up_gate_proj.weight', 'model.layers.1.mlp.experts.66.up_gate_proj.weight', 'model.layers.1.mlp.experts.67.up_gate_proj.weight', 'model.layers.1.mlp.experts.68.up_gate_proj.weight', 'model.layers.1.mlp.experts.69.up_gate_proj.weight', 'model.layers.1.mlp.experts.70.up_gate_proj.weight', 'model.layers.1.mlp.experts.71.up_gate_proj.weight', 'model.layers.1.mlp.experts.72.up_gate_proj.weight', 'model.layers.1.mlp.experts.73.up_gate_proj.weight', 'model.layers.1.mlp.experts.74.up_gate_proj.weight', 'model.layers.1.mlp.experts.75.up_gate_proj.weight', 'model.layers.1.mlp.experts.76.up_gate_proj.weight', 'model.layers.1.mlp.experts.77.up_gate_proj.weight', 'model.layers.1.mlp.experts.78.up_gate_proj.weight', 'model.layers.1.mlp.experts.79.up_gate_proj.weight', 'model.layers.1.mlp.experts.80.up_gate_proj.weight', 'model.layers.1.mlp.experts.81.up_gate_proj.weight', 'model.layers.1.mlp.experts.82.up_gate_proj.weight', 'model.layers.1.mlp.experts.83.up_gate_proj.weight', 'model.layers.1.mlp.experts.84.up_gate_proj.weight', 'model.layers.1.mlp.experts.85.up_gate_proj.weight', 'model.layers.1.mlp.experts.86.up_gate_proj.weight', 'model.layers.1.mlp.experts.87.up_gate_proj.weight', 'model.layers.1.mlp.experts.88.up_gate_proj.weight', 'model.layers.1.mlp.experts.89.up_gate_proj.weight', 'model.layers.1.mlp.experts.90.up_gate_proj.weight', 'model.layers.1.mlp.experts.91.up_gate_proj.weight', 'model.layers.1.mlp.experts.92.up_gate_proj.weight', 'model.layers.1.mlp.experts.93.up_gate_proj.weight', 'model.layers.1.mlp.experts.94.up_gate_proj.weight', 'model.layers.1.mlp.experts.95.up_gate_proj.weight', 'model.layers.1.mlp.experts.96.up_gate_proj.weight', 'model.layers.1.mlp.experts.97.up_gate_proj.weight', 'model.layers.1.mlp.experts.98.up_gate_proj.weight', 'model.layers.1.mlp.experts.99.up_gate_proj.weight', 'model.layers.1.mlp.experts.100.up_gate_proj.weight', 'model.layers.1.mlp.experts.101.up_gate_proj.weight', 'model.layers.1.mlp.experts.102.up_gate_proj.weight', 'model.layers.1.mlp.experts.103.up_gate_proj.weight', 'model.layers.1.mlp.experts.104.up_gate_proj.weight', 'model.layers.1.mlp.experts.105.up_gate_proj.weight', 'model.layers.1.mlp.experts.106.up_gate_proj.weight', 'model.layers.1.mlp.experts.107.up_gate_proj.weight', 'model.layers.1.mlp.experts.108.up_gate_proj.weight', 'model.layers.1.mlp.experts.109.up_gate_proj.weight', 'model.layers.1.mlp.experts.110.up_gate_proj.weight', 
'model.layers.1.mlp.experts.111.up_gate_proj.weight', 'model.layers.1.mlp.experts.112.up_gate_proj.weight', 'model.layers.1.mlp.experts.113.up_gate_proj.weight', 'model.layers.1.mlp.experts.114.up_gate_proj.weight', 'model.layers.1.mlp.experts.115.up_gate_proj.weight', 'model.layers.1.mlp.experts.116.up_gate_proj.weight', 'model.layers.1.mlp.experts.117.up_gate_proj.weight', 'model.layers.1.mlp.experts.118.up_gate_proj.weight', 'model.layers.1.mlp.experts.119.up_gate_proj.weight', 'model.layers.1.mlp.experts.120.up_gate_proj.weight', 'model.layers.1.mlp.experts.121.up_gate_proj.weight', 'model.layers.1.mlp.experts.122.up_gate_proj.weight', 'model.layers.1.mlp.experts.123.up_gate_proj.weight', 'model.layers.1.mlp.experts.124.up_gate_proj.weight', 'model.layers.1.mlp.experts.125.up_gate_proj.weight', 'model.layers.1.mlp.experts.126.up_gate_proj.weight', 'model.layers.1.mlp.experts.127.up_gate_proj.weight'] +model.layers.1.mlp.experts.down_proj_weight:['model.layers.1.mlp.experts.0.down_proj.weight', 'model.layers.1.mlp.experts.1.down_proj.weight', 'model.layers.1.mlp.experts.2.down_proj.weight', 'model.layers.1.mlp.experts.3.down_proj.weight', 'model.layers.1.mlp.experts.4.down_proj.weight', 'model.layers.1.mlp.experts.5.down_proj.weight', 'model.layers.1.mlp.experts.6.down_proj.weight', 'model.layers.1.mlp.experts.7.down_proj.weight', 'model.layers.1.mlp.experts.8.down_proj.weight', 'model.layers.1.mlp.experts.9.down_proj.weight', 'model.layers.1.mlp.experts.10.down_proj.weight', 'model.layers.1.mlp.experts.11.down_proj.weight', 'model.layers.1.mlp.experts.12.down_proj.weight', 'model.layers.1.mlp.experts.13.down_proj.weight', 'model.layers.1.mlp.experts.14.down_proj.weight', 'model.layers.1.mlp.experts.15.down_proj.weight', 'model.layers.1.mlp.experts.16.down_proj.weight', 'model.layers.1.mlp.experts.17.down_proj.weight', 'model.layers.1.mlp.experts.18.down_proj.weight', 'model.layers.1.mlp.experts.19.down_proj.weight', 'model.layers.1.mlp.experts.20.down_proj.weight', 'model.layers.1.mlp.experts.21.down_proj.weight', 'model.layers.1.mlp.experts.22.down_proj.weight', 'model.layers.1.mlp.experts.23.down_proj.weight', 'model.layers.1.mlp.experts.24.down_proj.weight', 'model.layers.1.mlp.experts.25.down_proj.weight', 'model.layers.1.mlp.experts.26.down_proj.weight', 'model.layers.1.mlp.experts.27.down_proj.weight', 'model.layers.1.mlp.experts.28.down_proj.weight', 'model.layers.1.mlp.experts.29.down_proj.weight', 'model.layers.1.mlp.experts.30.down_proj.weight', 'model.layers.1.mlp.experts.31.down_proj.weight', 'model.layers.1.mlp.experts.32.down_proj.weight', 'model.layers.1.mlp.experts.33.down_proj.weight', 'model.layers.1.mlp.experts.34.down_proj.weight', 'model.layers.1.mlp.experts.35.down_proj.weight', 'model.layers.1.mlp.experts.36.down_proj.weight', 'model.layers.1.mlp.experts.37.down_proj.weight', 'model.layers.1.mlp.experts.38.down_proj.weight', 'model.layers.1.mlp.experts.39.down_proj.weight', 'model.layers.1.mlp.experts.40.down_proj.weight', 'model.layers.1.mlp.experts.41.down_proj.weight', 'model.layers.1.mlp.experts.42.down_proj.weight', 'model.layers.1.mlp.experts.43.down_proj.weight', 'model.layers.1.mlp.experts.44.down_proj.weight', 'model.layers.1.mlp.experts.45.down_proj.weight', 'model.layers.1.mlp.experts.46.down_proj.weight', 'model.layers.1.mlp.experts.47.down_proj.weight', 'model.layers.1.mlp.experts.48.down_proj.weight', 'model.layers.1.mlp.experts.49.down_proj.weight', 'model.layers.1.mlp.experts.50.down_proj.weight', 'model.layers.1.mlp.experts.51.down_proj.weight', 
'model.layers.1.mlp.experts.52.down_proj.weight', 'model.layers.1.mlp.experts.53.down_proj.weight', 'model.layers.1.mlp.experts.54.down_proj.weight', 'model.layers.1.mlp.experts.55.down_proj.weight', 'model.layers.1.mlp.experts.56.down_proj.weight', 'model.layers.1.mlp.experts.57.down_proj.weight', 'model.layers.1.mlp.experts.58.down_proj.weight', 'model.layers.1.mlp.experts.59.down_proj.weight', 'model.layers.1.mlp.experts.60.down_proj.weight', 'model.layers.1.mlp.experts.61.down_proj.weight', 'model.layers.1.mlp.experts.62.down_proj.weight', 'model.layers.1.mlp.experts.63.down_proj.weight', 'model.layers.1.mlp.experts.64.down_proj.weight', 'model.layers.1.mlp.experts.65.down_proj.weight', 'model.layers.1.mlp.experts.66.down_proj.weight', 'model.layers.1.mlp.experts.67.down_proj.weight', 'model.layers.1.mlp.experts.68.down_proj.weight', 'model.layers.1.mlp.experts.69.down_proj.weight', 'model.layers.1.mlp.experts.70.down_proj.weight', 'model.layers.1.mlp.experts.71.down_proj.weight', 'model.layers.1.mlp.experts.72.down_proj.weight', 'model.layers.1.mlp.experts.73.down_proj.weight', 'model.layers.1.mlp.experts.74.down_proj.weight', 'model.layers.1.mlp.experts.75.down_proj.weight', 'model.layers.1.mlp.experts.76.down_proj.weight', 'model.layers.1.mlp.experts.77.down_proj.weight', 'model.layers.1.mlp.experts.78.down_proj.weight', 'model.layers.1.mlp.experts.79.down_proj.weight', 'model.layers.1.mlp.experts.80.down_proj.weight', 'model.layers.1.mlp.experts.81.down_proj.weight', 'model.layers.1.mlp.experts.82.down_proj.weight', 'model.layers.1.mlp.experts.83.down_proj.weight', 'model.layers.1.mlp.experts.84.down_proj.weight', 'model.layers.1.mlp.experts.85.down_proj.weight', 'model.layers.1.mlp.experts.86.down_proj.weight', 'model.layers.1.mlp.experts.87.down_proj.weight', 'model.layers.1.mlp.experts.88.down_proj.weight', 'model.layers.1.mlp.experts.89.down_proj.weight', 'model.layers.1.mlp.experts.90.down_proj.weight', 'model.layers.1.mlp.experts.91.down_proj.weight', 'model.layers.1.mlp.experts.92.down_proj.weight', 'model.layers.1.mlp.experts.93.down_proj.weight', 'model.layers.1.mlp.experts.94.down_proj.weight', 'model.layers.1.mlp.experts.95.down_proj.weight', 'model.layers.1.mlp.experts.96.down_proj.weight', 'model.layers.1.mlp.experts.97.down_proj.weight', 'model.layers.1.mlp.experts.98.down_proj.weight', 'model.layers.1.mlp.experts.99.down_proj.weight', 'model.layers.1.mlp.experts.100.down_proj.weight', 'model.layers.1.mlp.experts.101.down_proj.weight', 'model.layers.1.mlp.experts.102.down_proj.weight', 'model.layers.1.mlp.experts.103.down_proj.weight', 'model.layers.1.mlp.experts.104.down_proj.weight', 'model.layers.1.mlp.experts.105.down_proj.weight', 'model.layers.1.mlp.experts.106.down_proj.weight', 'model.layers.1.mlp.experts.107.down_proj.weight', 'model.layers.1.mlp.experts.108.down_proj.weight', 'model.layers.1.mlp.experts.109.down_proj.weight', 'model.layers.1.mlp.experts.110.down_proj.weight', 'model.layers.1.mlp.experts.111.down_proj.weight', 'model.layers.1.mlp.experts.112.down_proj.weight', 'model.layers.1.mlp.experts.113.down_proj.weight', 'model.layers.1.mlp.experts.114.down_proj.weight', 'model.layers.1.mlp.experts.115.down_proj.weight', 'model.layers.1.mlp.experts.116.down_proj.weight', 'model.layers.1.mlp.experts.117.down_proj.weight', 'model.layers.1.mlp.experts.118.down_proj.weight', 'model.layers.1.mlp.experts.119.down_proj.weight', 'model.layers.1.mlp.experts.120.down_proj.weight', 'model.layers.1.mlp.experts.121.down_proj.weight', 
'model.layers.1.mlp.experts.122.down_proj.weight', 'model.layers.1.mlp.experts.123.down_proj.weight', 'model.layers.1.mlp.experts.124.down_proj.weight', 'model.layers.1.mlp.experts.125.down_proj.weight', 'model.layers.1.mlp.experts.126.down_proj.weight', 'model.layers.1.mlp.experts.127.down_proj.weight']
+model.layers.0.self_attn.qkv_proj.bias:model.layers.0.self_attn.qkv_proj.bias
+model.layers.0.self_attn.qkv_proj.weight:model.layers.0.self_attn.qkv_proj.weight
+model.layers.0.self_attn.o_proj.weight:model.layers.0.self_attn.o_proj.weight
+model.layers.0.mlp.up_gate_proj.weight:model.layers.0.mlp.up_gate_proj.weight
+model.layers.0.mlp.down_proj.weight:model.layers.0.mlp.down_proj.weight
+model.layers.0.input_layernorm.weight:model.layers.0.input_layernorm.weight
+model.layers.0.post_attention_layernorm.weight:model.layers.0.post_attention_layernorm.weight
+model.layers.1.self_attn.qkv_proj.bias:model.layers.1.self_attn.qkv_proj.bias
+model.layers.1.self_attn.qkv_proj.weight:model.layers.1.self_attn.qkv_proj.weight
+model.layers.1.self_attn.o_proj.weight:model.layers.1.self_attn.o_proj.weight
+model.layers.1.mlp.shared_experts.up_gate_proj.weight:model.layers.1.mlp.shared_experts.up_gate_proj.weight
+model.layers.1.mlp.shared_experts.down_proj.weight:model.layers.1.mlp.shared_experts.down_proj.weight
+model.layers.1.input_layernorm.weight:model.layers.1.input_layernorm.weight
+model.layers.1.post_attention_layernorm.weight:model.layers.1.post_attention_layernorm.weight
+model.norm.weight:model.norm.weight
diff --git a/tests/ci_use/GLM-45-AIR/test_rollout_model.py b/tests/ci_use/GLM-45-AIR/test_rollout_model.py
new file mode 100644
index 000000000..aed74a381
--- /dev/null
+++ b/tests/ci_use/GLM-45-AIR/test_rollout_model.py
@@ -0,0 +1,66 @@
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import subprocess
+import sys
+
+
+def test_rollout_model_with_distributed_launch():
+    """
+    test_rollout_model
+    """
+    current_dir = os.path.dirname(os.path.abspath(__file__))
+    utils_dir = os.path.join(os.path.dirname(current_dir), "utils")
+    rollout_script = os.path.join(utils_dir, "rollout_model.py")
+    baseline_path = os.path.join(current_dir, "baseline.txt")
+
+    base_path = os.getenv("MODEL_PATH")
+    if base_path:
+        model_path = os.path.join(base_path, "GLM-4.5-Air-Fake")
+    else:
+        model_path = "./GLM-4.5-Air-Fake"
+    print(f"model_path = {model_path}")
+
+    command = [
+        sys.executable,
+        "-m",
+        "paddle.distributed.launch",
+        "--gpus",
+        "0,1",
+        rollout_script,
+        "--model_path",
+        model_path,
+        "--baseline_path",
+        baseline_path,
+    ]
+
+    print(f"Executing command: {' '.join(command)}")
+
+    process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
+
+    try:
+        stdout, stderr = process.communicate(timeout=300)
+        return_code = process.returncode
+    except subprocess.TimeoutExpired:
+        process.kill()
+        stdout, stderr = process.communicate()
+        return_code = -1
+
+    print("\n" + "=" * 50 + " STDOUT " + "=" * 50)
+    print(stdout)
+    print("\n" + "=" * 50 + " STDERR " + "=" * 50)
+    print(stderr)
+
+    assert return_code == 0, f"Process exited with code {return_code}"
diff --git a/tests/ci_use/EB_VL_Lite/rollout_model.py b/tests/ci_use/utils/rollout_model.py
similarity index 85%
rename from tests/ci_use/EB_VL_Lite/rollout_model.py
rename to tests/ci_use/utils/rollout_model.py
index b68d4c308..3e58933f1 100644
--- a/tests/ci_use/EB_VL_Lite/rollout_model.py
+++ b/tests/ci_use/utils/rollout_model.py
@@ -23,6 +23,9 @@ _, ranks = init_dist_env()
 
 parser = argparse.ArgumentParser()
 parser.add_argument("--model_path", type=str, required=True, help="Path to the model directory")
+parser.add_argument("--baseline_path", type=str, required=True, help="Path to the baseline file")
+parser.add_argument("--quantization", type=str, default=None, help="Quantization method")
+parser.add_argument("--enable_mm", action="store_true", required=False, help="Flag to enable the multi-modal model")
 args = parser.parse_args()
 
 # base result
@@ -35,9 +38,11 @@ init_kwargs = {
     "tensor_parallel_size": ranks,
     "dynamic_load_weight": True,
     "load_strategy": "ipc_snapshot",
-    "enable_mm": True,
-    "quantization": "wint8",
+    "quantization": args.quantization,
 }
+if args.enable_mm:
+    init_kwargs["enable_mm"] = True
+
 rollout_config = RolloutModelConfig(**init_kwargs)
 
 actor_eval_model = RolloutModel(rollout_config)
@@ -75,7 +80,7 @@ def compare_strings_line_by_line(a: str, b: str) -> bool:
     return True
 
-with open("baseline.txt", "r", encoding="utf-8") as f:
+with open(args.baseline_path, "r", encoding="utf-8") as f:
     baseline = f.read()
 
 assert compare_strings_line_by_line(baseline, content), (
     "In the unittest of RL scenario, your modification "
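
Usage note: below is a minimal sketch of how the per-parameter split actions returned by Glm4MoePretrainedModel._get_tensor_parallel_mappings could be applied when sharding a full checkpoint for one tensor-parallel rank. It is not part of the diff; the call signature of each action (a callable over the full weight tensor) is an assumption inferred from the partial(fn, is_column=...) pattern above, and shard_state_dict is a hypothetical helper name.

from fastdeploy.model_executor.models.glm4_moe import Glm4MoePretrainedModel


def shard_state_dict(full_state_dict, config):
    """Hypothetical helper: shard a full checkpoint for the current TP rank."""
    # Assumption: each mapped action takes the full weight tensor and returns
    # the shard belonging to config.tensor_parallel_rank.
    actions = Glm4MoePretrainedModel._get_tensor_parallel_mappings(config, is_split=True)
    sharded = {}
    for name, weight in full_state_dict.items():
        action = actions.get(name)
        # Parameters without a split action (e.g. layer norms) are replicated as-is.
        sharded[name] = action(weight) if action is not None else weight
    return sharded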
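Likewise, a hedged sketch of consuming the mapping produced by Glm4MoeForCausalLMRL.get_name_mappings_to_training: most entries map one inference name to one training name, while the fused MoE entries (experts.up_gate_proj_weight and experts.down_proj_weight) map to a list of per-expert training names, as the baseline above shows. The gather_training_weights helper and the stacking axis are assumptions for illustration, not part of the diff.

import paddle


def gather_training_weights(infer_to_train_mapping, train_state_dict):
    """Hypothetical helper: collect training tensors under inference names."""
    infer_state = {}
    for infer_name, train_name in infer_to_train_mapping.items():
        if isinstance(train_name, list):
            # Fused MoE weights: one tensor per routed expert; stacking along
            # axis 0 is an illustrative assumption.
            infer_state[infer_name] = paddle.stack(
                [train_state_dict[n] for n in train_name], axis=0
            )
        else:
            infer_state[infer_name] = train_state_dict[train_name]
    return infer_state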