V1 loader default (#4251)

* v1 loader

* update

* update
bukejiyu
2025-10-15 16:49:17 +08:00
committed by GitHub
parent e98c1c2f47
commit bcaa98ff9c
4 changed files with 68 additions and 9 deletions

View File

@@ -387,7 +387,7 @@ class EngineArgs:
Configuration for early stop.
"""
load_choices: str = "default"
load_choices: str = "default_v1"
"""The format of the model weights to load.
Options include:
- "default": default loader.
@@ -715,7 +715,7 @@ class EngineArgs:
type=str,
default=EngineArgs.load_choices,
help="The format of the model weights to load.\
- default/new_loader.",
+ default/default_v1.",
)
# CacheConfig parameters group
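For readers trying out the new default: the loader can still be pinned explicitly when building the engine arguments, and the same knob is exposed on the command line through the argparse option in the hunk directly above. A minimal sketch; apart from `load_choices`, which appears in this diff, the module path and the `model` field are assumptions, not verified against this revision:

```python
# Hypothetical usage sketch, not part of this commit: pin the loader choice
# instead of relying on the new "default_v1" default.
from fastdeploy.engine.args_utils import EngineArgs  # assumed module path

args = EngineArgs(
    model="./my-model",       # placeholder checkpoint directory
    load_choices="default",   # opt back into the v0 loader explicitly
)
print(args.load_choices)
```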

View File

@@ -184,12 +184,20 @@ class UnquantizedFusedMoEMethod(MoEMethodBase):
def create_weights(self, layer: nn.Layer, **extra_weight_attrs):
if current_platform.is_cuda():
- self.up_gate_proj_weight_shape = [layer.num_experts, layer.hidden_size, layer.moe_intermediate_size * 2]
- self.down_proj_weight_shape = [layer.num_experts, layer.moe_intermediate_size, layer.hidden_size]
+ self.up_gate_proj_weight_shape = [
+     layer.num_local_experts,
+     layer.hidden_size,
+     layer.moe_intermediate_size * 2,
+ ]
+ self.down_proj_weight_shape = [layer.num_local_experts, layer.moe_intermediate_size, layer.hidden_size]
extra_weight_attrs = {**extra_weight_attrs, "SHARD_ID_TO_SHARDED_DIM": {"gate": 1, "down": 0, "up": 1}}
else:
- self.up_gate_proj_weight_shape = [layer.num_experts, layer.moe_intermediate_size * 2, layer.hidden_size]
- self.down_proj_weight_shape = [layer.num_experts, layer.hidden_size, layer.moe_intermediate_size]
+ self.up_gate_proj_weight_shape = [
+     layer.num_local_experts,
+     layer.moe_intermediate_size * 2,
+     layer.hidden_size,
+ ]
+ self.down_proj_weight_shape = [layer.num_local_experts, layer.hidden_size, layer.moe_intermediate_size]
extra_weight_attrs = {**extra_weight_attrs, "SHARD_ID_TO_SHARDED_DIM": {"gate": 0, "down": 1, "up": 0}}
layer.up_gate_proj_weight = layer.create_parameter(
@@ -209,7 +217,6 @@ class UnquantizedFusedMoEMethod(MoEMethodBase):
{
"weight_loader": extra_weight_attrs.get("weight_loader", default_weight_loader(layer.fd_config)),
"weight_need_transpose": extra_weight_attrs.get("model_format") == "torch",
"model_format": extra_weight_attrs.get("model_format", ""),
},
)
set_weight_attrs(
@@ -217,6 +224,5 @@ class UnquantizedFusedMoEMethod(MoEMethodBase):
{
"weight_loader": extra_weight_attrs.get("weight_loader", default_weight_loader(layer.fd_config)),
"weight_need_transpose": extra_weight_attrs.get("model_format") == "torch",
"model_format": extra_weight_attrs.get("model_format", ""),
},
)
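The substantive change in this file is sizing the fused MoE weights by `layer.num_local_experts` (the experts resident on this rank) rather than the global `layer.num_experts`. A minimal sketch of that relationship under even expert sharding; the helper and the example numbers are illustrative, not taken from FastDeploy:

```python
# Illustrative only: per-rank ("local") expert count under even sharding.
def local_expert_count(num_experts: int, num_expert_parallel_ranks: int) -> int:
    assert num_experts % num_expert_parallel_ranks == 0, "experts must split evenly"
    return num_experts // num_expert_parallel_ranks


# e.g. 64 experts over 4 ranks -> each rank allocates weights for 16 experts,
# so on CUDA up_gate_proj gets shape [16, hidden_size, moe_intermediate_size * 2].
print(local_expert_count(64, 4))  # 16
```

The `SHARD_ID_TO_SHARDED_DIM` attribute set alongside these shapes presumably records, for each shard id (`gate`, `up`, `down`), which axis of the per-rank tensor is split; that reading is inferred from the names, not stated in this diff.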

View File

@@ -14,14 +14,18 @@
# limitations under the License.
"""
import os
import re
from contextlib import contextmanager
from typing import Any, Optional, Union
import paddle
from paddleformers.utils.log import logger
from fastdeploy import envs
from fastdeploy.config import FDConfig
from fastdeploy.model_executor.layers.utils import get_tensor
from fastdeploy.platforms import current_platform
class BitMaskTracker:
@@ -194,6 +198,53 @@ def default_weight_loader(fd_config: FDConfig) -> None:
return fn

def is_pre_sliced_weight(model_path):
    rank_dirs = [
        f for f in os.listdir(model_path) if f.startswith("rank") and os.path.isdir(os.path.join(model_path, f))
    ]
    return len(rank_dirs) > 1

def v1_loader_support(fd_config):
    _v1_no_support_archs = ["Qwen2VLForConditionalGeneration", "Qwen2_5_VLForConditionalGeneration"]

    def _err_msg(msg: str) -> str:
        logger.info(msg + "; fallback to the v0 loader for model loading.")

    if not current_platform.is_cuda():
        _err_msg("v1 loader currently does not support backends other than CUDA")
        return False
    if is_pre_sliced_weight(fd_config.model_config.model):
        _err_msg("v1 loader currently does not support pre-sliced weights")
        return False
    if fd_config.parallel_config.use_ep:
        _err_msg("v1 loader currently does not support expert parallelism")
        return False
    if envs.FD_MOE_BACKEND.lower() == "marlin":
        _err_msg("v1 loader currently does not support marlin backend")
        return False
    if fd_config.quant_config is not None:
        if fd_config.quant_config.name() == "mix_quant":
            moe_quant_type = fd_config.quant_config.moe_quant_type
            dense_quant_type = fd_config.quant_config.dense_quant_type
        else:
            moe_quant_type = fd_config.quant_config.name()
            dense_quant_type = fd_config.quant_config.name()
        unsupported_quant = {"w4a8", "w4afp8", "wint2"}
        if unsupported_quant & {moe_quant_type, dense_quant_type}:
            _err_msg("v1 loader currently does not support w4a8/w4afp8/wint2 quantization")
            return False
    if fd_config.model_config.architectures[0] in _v1_no_support_archs:
        _err_msg(f"v1 loader currently does not support {fd_config.model_config.architectures[0]}")
        return False
    return True

@contextmanager
def temporary_dtype(dtype: str):
"""Temporarily set Paddle default dtype"""

View File

@@ -44,6 +44,7 @@ from fastdeploy.input.ernie4_5_tokenizer import Ernie4_5Tokenizer
from fastdeploy.inter_communicator import EngineWorkerQueue as TaskQueue
from fastdeploy.inter_communicator import ExistTaskStatus, IPCSignal, ModelWeightsStatus
from fastdeploy.model_executor.layers.quantization import parse_quant_config
from fastdeploy.model_executor.utils import v1_loader_support
from fastdeploy.platforms import current_platform
from fastdeploy.scheduler import SchedulerConfig
from fastdeploy.utils import get_logger, optional_type
@@ -812,7 +813,8 @@ def initialize_fd_config(args, ranks: int = 1, local_rank: int = 0) -> FDConfig:
plas_attention_config=plas_attention_config,
)
update_fd_config_for_mm(fd_config)
+ if fd_config.load_config.load_choices == "default_v1" and not v1_loader_support(fd_config):
+     fd_config.load_config.load_choices = "default"
return fd_config
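One of the gates that can trigger this silent fallback is the quantization check in `v1_loader_support`. A standalone sketch of that set-intersection test; the unsupported names are copied from the hunk above, while "wint8" and the helper itself are illustrative:

```python
# Standalone illustration of the quantization gate: any overlap between the
# configured quant types and the unsupported set forces the v0 loader.
UNSUPPORTED_QUANT = {"w4a8", "w4afp8", "wint2"}


def quant_ok_for_v1(moe_quant_type: str, dense_quant_type: str) -> bool:
    return not (UNSUPPORTED_QUANT & {moe_quant_type, dense_quant_type})


print(quant_ok_for_v1("wint8", "wint8"))  # True  -> "default_v1" stays in effect
print(quant_ok_for_v1("w4a8", "wint8"))   # False -> load_choices falls back to "default"
```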