Mirror of https://github.com/PaddlePaddle/FastDeploy.git (synced 2025-11-01 12:22:53 +08:00)
@@ -387,7 +387,7 @@ class EngineArgs:
     Configuration for early stop.
     """

-    load_choices: str = "default"
+    load_choices: str = "default_v1"
     """The format of the model weights to load.
     Options include:
     - "default": default loader.
@@ -715,7 +715,7 @@ class EngineArgs:
             type=str,
             default=EngineArgs.load_choices,
             help="The format of the model weights to load.\
-            default/new_loader.",
+            default/default_v1.",
         )

         # CacheConfig parameters group
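In practice this hunk only flips the default: the v1 loader is used unless the user opts back into the v0 path. Below is a minimal, self-contained argparse reproduction of just this option, not the real EngineArgs CLI; the flag name "--load_choices" is an assumption inferred from the attribute name.

    import argparse

    # Hypothetical stand-in for the real EngineArgs flag; only the default changes in this PR.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--load_choices",
        type=str,
        default="default_v1",  # previously "default"
        help="The format of the model weights to load. default/default_v1.",
    )

    print(parser.parse_args([]).load_choices)                             # default_v1 (new default)
    print(parser.parse_args(["--load_choices", "default"]).load_choices)  # default (explicit opt-out)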
@@ -184,12 +184,20 @@ class UnquantizedFusedMoEMethod(MoEMethodBase):
     def create_weights(self, layer: nn.Layer, **extra_weight_attrs):

         if current_platform.is_cuda():
-            self.up_gate_proj_weight_shape = [layer.num_experts, layer.hidden_size, layer.moe_intermediate_size * 2]
-            self.down_proj_weight_shape = [layer.num_experts, layer.moe_intermediate_size, layer.hidden_size]
+            self.up_gate_proj_weight_shape = [
+                layer.num_local_experts,
+                layer.hidden_size,
+                layer.moe_intermediate_size * 2,
+            ]
+            self.down_proj_weight_shape = [layer.num_local_experts, layer.moe_intermediate_size, layer.hidden_size]
+            extra_weight_attrs = {**extra_weight_attrs, "SHARD_ID_TO_SHARDED_DIM": {"gate": 1, "down": 0, "up": 1}}
         else:
-            self.up_gate_proj_weight_shape = [layer.num_experts, layer.moe_intermediate_size * 2, layer.hidden_size]
-            self.down_proj_weight_shape = [layer.num_experts, layer.hidden_size, layer.moe_intermediate_size]
+            self.up_gate_proj_weight_shape = [
+                layer.num_local_experts,
+                layer.moe_intermediate_size * 2,
+                layer.hidden_size,
+            ]
+            self.down_proj_weight_shape = [layer.num_local_experts, layer.hidden_size, layer.moe_intermediate_size]
+            extra_weight_attrs = {**extra_weight_attrs, "SHARD_ID_TO_SHARDED_DIM": {"gate": 0, "down": 1, "up": 0}}

         layer.up_gate_proj_weight = layer.create_parameter(
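The shapes now use layer.num_local_experts (the experts resident on this rank) rather than the global layer.num_experts, and a SHARD_ID_TO_SHARDED_DIM attribute is attached. Judging from the two layouts above, the map appears to record, for each logical shard (gate/up/down), which dimension of a per-expert 2-D weight is split under tensor parallelism; the dims flip between branches because the CUDA layout stores [hidden, 2*inter] while the other backends store the transpose. A hypothetical loader-side use of that attribute is sketched below, written with numpy only to keep it runnable; the real loaders operate on paddle tensors and are not part of this diff.

    import numpy as np

    # CUDA-layout mapping taken from the hunk above; the loader itself is hypothetical.
    SHARD_ID_TO_SHARDED_DIM = {"gate": 1, "down": 0, "up": 1}

    def shard_for_rank(full_weight: np.ndarray, shard_id: str, tp_rank: int, tp_size: int) -> np.ndarray:
        """Slice one expert's 2-D weight along the tensor-parallel dimension for this rank."""
        dim = SHARD_ID_TO_SHARDED_DIM[shard_id]
        size = full_weight.shape[dim] // tp_size
        index = [slice(None)] * full_weight.ndim
        index[dim] = slice(tp_rank * size, (tp_rank + 1) * size)
        return full_weight[tuple(index)]

    hidden, inter = 8, 4
    up = np.arange(hidden * inter).reshape(hidden, inter)            # per-expert "up" weight, [hidden, inter]
    print(shard_for_rank(up, "up", tp_rank=0, tp_size=2).shape)      # (8, 2): split along dim 1
    down = np.arange(inter * hidden).reshape(inter, hidden)          # per-expert "down" weight, [inter, hidden]
    print(shard_for_rank(down, "down", tp_rank=1, tp_size=2).shape)  # (2, 8): split along dim 0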
@@ -209,7 +217,6 @@ class UnquantizedFusedMoEMethod(MoEMethodBase):
            {
                "weight_loader": extra_weight_attrs.get("weight_loader", default_weight_loader(layer.fd_config)),
                "weight_need_transpose": extra_weight_attrs.get("model_format") == "torch",
                "model_format": extra_weight_attrs.get("model_format", ""),
            },
        )
        set_weight_attrs(
@@ -217,6 +224,5 @@ class UnquantizedFusedMoEMethod(MoEMethodBase):
            {
                "weight_loader": extra_weight_attrs.get("weight_loader", default_weight_loader(layer.fd_config)),
                "weight_need_transpose": extra_weight_attrs.get("model_format") == "torch",
                "model_format": extra_weight_attrs.get("model_format", ""),
            },
        )
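Both set_weight_attrs calls attach the same metadata to up_gate_proj_weight and down_proj_weight: a weight_loader callback, a weight_need_transpose flag derived from the checkpoint's model_format, and the raw model_format string. The schematic below shows how such per-parameter attributes are typically consumed while a checkpoint is streamed in; the classes and loop are stubs for illustration, not FastDeploy's actual loading code.

    # Stub parameter carrying the metadata that set_weight_attrs would attach.
    class FakeParam:
        def __init__(self, weight_loader, weight_need_transpose=False):
            self.weight_loader = weight_loader
            self.weight_need_transpose = weight_need_transpose
            self.value = None

    def copy_loader(param, loaded):
        # Simplest possible weight_loader: copy the incoming tensor into the parameter.
        param.value = loaded

    def stream_checkpoint(params_by_name, checkpoint):
        for name, tensor in checkpoint.items():
            param = params_by_name[name]
            if getattr(param, "weight_need_transpose", False):
                tensor = [list(col) for col in zip(*tensor)]  # torch-format weights arrive transposed
            getattr(param, "weight_loader", copy_loader)(param, tensor)

    p = FakeParam(copy_loader, weight_need_transpose=True)
    stream_checkpoint({"down_proj_weight": p}, {"down_proj_weight": [[1, 2, 3], [4, 5, 6]]})
    print(p.value)  # [[1, 4], [2, 5], [3, 6]]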
@@ -14,14 +14,18 @@
# limitations under the License.
"""

import os
import re
from contextlib import contextmanager
from typing import Any, Optional, Union

import paddle
from paddleformers.utils.log import logger

from fastdeploy import envs
from fastdeploy.config import FDConfig
from fastdeploy.model_executor.layers.utils import get_tensor
from fastdeploy.platforms import current_platform


class BitMaskTracker:
@@ -194,6 +198,53 @@ def default_weight_loader(fd_config: FDConfig) -> None:
     return fn


+def is_pre_sliced_weight(model_path):
+    rank_dirs = [
+        f for f in os.listdir(model_path) if f.startswith("rank") and os.path.isdir(os.path.join(model_path, f))
+    ]
+    return len(rank_dirs) > 1
+
+
+def v1_loader_support(fd_config):
+    _v1_no_support_archs = ["Qwen2VLForConditionalGeneration", "Qwen2_5_VLForConditionalGeneration"]
+
+    def _err_msg(msg: str) -> None:
+        logger.info(msg + "; fallback to the v0 loader for model loading.")
+
+    if not current_platform.is_cuda():
+        _err_msg("v1 loader currently does not support backends other than CUDA")
+        return False
+
+    if is_pre_sliced_weight(fd_config.model_config.model):
+        _err_msg("v1 loader currently does not support pre-sliced weights")
+        return False
+
+    if fd_config.parallel_config.use_ep:
+        _err_msg("v1 loader currently does not support expert parallelism")
+        return False
+
+    if envs.FD_MOE_BACKEND.lower() == "marlin":
+        _err_msg("v1 loader currently does not support marlin backend")
+        return False
+
+    if fd_config.quant_config is not None:
+        if fd_config.quant_config.name() == "mix_quant":
+            moe_quant_type = fd_config.quant_config.moe_quant_type
+            dense_quant_type = fd_config.quant_config.dense_quant_type
+        else:
+            moe_quant_type = fd_config.quant_config.name()
+            dense_quant_type = fd_config.quant_config.name()
+        unsupported_quant = {"w4a8", "w4afp8", "wint2"}
+
+        if unsupported_quant & {moe_quant_type, dense_quant_type}:
+            _err_msg("v1 loader currently does not support w4a8/w4afp8/wint2 quantization")
+            return False
+    if fd_config.model_config.architectures[0] in _v1_no_support_archs:
+        _err_msg(f"v1 loader currently does not support {fd_config.model_config.architectures[0]}")
+        return False
+    return True
+
+
 @contextmanager
 def temporary_dtype(dtype: str):
     """Temporarily set Paddle default dtype"""
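The body of temporary_dtype lies outside this hunk; only its signature and docstring appear as context. For reference, a save/set/restore context manager of this kind is usually written roughly as below. This is a sketch assuming it wraps paddle.get_default_dtype / paddle.set_default_dtype (real Paddle APIs); the name temporary_dtype_sketch is made up to avoid implying this is the actual body.

    from contextlib import contextmanager

    import paddle

    @contextmanager
    def temporary_dtype_sketch(dtype: str):
        """Temporarily switch Paddle's default dtype, restoring the previous one on exit."""
        previous = paddle.get_default_dtype()
        try:
            if dtype:
                paddle.set_default_dtype(dtype)
            yield
        finally:
            paddle.set_default_dtype(previous)

    with temporary_dtype_sketch("float32"):
        w = paddle.ones([2, 2])  # created under the temporary default dtype
    print(w.dtype, paddle.get_default_dtype())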
@@ -44,6 +44,7 @@ from fastdeploy.input.ernie4_5_tokenizer import Ernie4_5Tokenizer
 from fastdeploy.inter_communicator import EngineWorkerQueue as TaskQueue
 from fastdeploy.inter_communicator import ExistTaskStatus, IPCSignal, ModelWeightsStatus
 from fastdeploy.model_executor.layers.quantization import parse_quant_config
+from fastdeploy.model_executor.utils import v1_loader_support
 from fastdeploy.platforms import current_platform
 from fastdeploy.scheduler import SchedulerConfig
 from fastdeploy.utils import get_logger, optional_type
@@ -812,7 +813,8 @@ def initialize_fd_config(args, ranks: int = 1, local_rank: int = 0) -> FDConfig:
         plas_attention_config=plas_attention_config,
     )
     update_fd_config_for_mm(fd_config)

+    if fd_config.load_config.load_choices == "default_v1" and not v1_loader_support(fd_config):
+        fd_config.load_config.load_choices = "default"
     return fd_config
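Put together, the engine now requests default_v1 and silently drops back to the v0 loader whenever any guard in v1_loader_support fails. A standalone illustration of that decision with stub objects follows; SimpleNamespace stands in for FDConfig, and the guard is a trivial placeholder rather than the real platform/quantization checks.

    from types import SimpleNamespace

    def fake_v1_loader_support(cfg):
        # Placeholder for the real guards (CUDA-only, no EP, no marlin, supported quantization, ...).
        return cfg.model_config.architectures[0] not in {"Qwen2_5_VLForConditionalGeneration"}

    cfg = SimpleNamespace(
        load_config=SimpleNamespace(load_choices="default_v1"),
        model_config=SimpleNamespace(architectures=["Qwen2_5_VLForConditionalGeneration"]),
    )

    if cfg.load_config.load_choices == "default_v1" and not fake_v1_loader_support(cfg):
        cfg.load_config.load_choices = "default"

    print(cfg.load_config.load_choices)  # "default" -> fell back to the v0 loader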