"""
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
from __future__ import annotations
import json
from dataclasses import dataclass, field
from enum import Enum
from typing import Optional
import paddle
from paddlenlp.transformers.configuration_utils import PretrainedConfig
from fastdeploy.model_executor.layers.quantization.quant_base import \
QuantConfigBase
from fastdeploy.utils import get_logger
logger = get_logger("config", "config.log")
__all__ = [
"ModelConfig",
]
class GenerationPhase(Enum):
"""
The generation phase of the model.
"""
PREFILL = 1
DECODER = 2
class ModelConfig(PretrainedConfig):
"""
The configuration class to store the configuration of an `LLM`.
"""
model_type = ""
def __init__(
self,
vocab_size: int = 100224,
hidden_size: int = 4096,
intermediate_size: Optional[int] = None,
num_layers: int = 48,
num_attention_heads: int = 32,
num_key_value_heads: Optional[int] = None,
hidden_act: str = "swiglu",
hidden_dropout_prob: float = 0.0,
max_position_embeddings: int = 512,
max_seq_len: int = 512,
initializer_range: float = 0.02,
type_vocab_size: int = 4,
use_rope=True,
use_rmsnorm=False,
weight_sharing=True,
weight_sharing_add_bias=False,
sequence_parallel=False,
use_flash_attention=False,
use_fast_ffn: bool = False,
tensor_parallel_output: bool = True,
fused_linear=False,
compression_ratio: float = 1.0,
rope_theta: int = 10000,
rope_3d: bool = False,
ori_vocab_size: int | None = None,
smooth: bool = False,
group_size: int = -1,
tools_version="4.10.0.dev",
system_prompt_version="V1",
moe_layer_start_index: int | None = None,
moe_use_gate_correction_bias: bool | None = None,
num_hidden_layers: int | None = None,
prefix_name="",
freeze_embedding=False,
rope_head_dim=None,
base_model_prefix=None,
use_moe=False,
ffn_hidden_size: Optional[int] = None,
dtype=None,
export_model_type: str = "default",
use_stop_seqs: bool = False,
return_all_hidden_states: bool = False,
start_layer_index: int = 0,
output_via_mq: bool = True,
generation_phase: GenerationPhase = GenerationPhase.PREFILL,
**kwargs,
):
super().__init__(**kwargs)
self.vocab_size = vocab_size
self.hidden_size = hidden_size
self.intermediate_size = intermediate_size
self.num_layers = num_layers
if num_hidden_layers is not None:
self.num_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.num_key_value_heads = num_key_value_heads
self.head_dim = hidden_size // num_attention_heads
self.hidden_act = hidden_act
self.hidden_dropout_prob = hidden_dropout_prob
self.max_position_embeddings = max_position_embeddings
self.initializer_range = initializer_range
self.type_vocab_size = type_vocab_size
self.use_rope = use_rope
self.use_rmsnorm = use_rmsnorm
self.weight_sharing = weight_sharing
self.weight_sharing_add_bias = weight_sharing_add_bias
self.use_flash_attention = use_flash_attention
self.use_fast_ffn = use_fast_ffn
self.tensor_parallel_output = tensor_parallel_output
self.skip_recompute_ops = dict()
self.fused_linear = fused_linear
self.compression_ratio = compression_ratio
self.rope_theta = rope_theta
self.ori_vocab_size = ori_vocab_size or vocab_size
self.smooth = smooth
self.group_size = group_size
self.max_seq_len = max_seq_len
self.tools_version = tools_version
self.system_prompt_version = system_prompt_version
self.prefix_name = prefix_name
self.freeze_embedding = freeze_embedding
self.rope_head_dim = rope_head_dim
self.use_moe = use_moe
self.base_model_prefix = base_model_prefix
if moe_layer_start_index is not None:
self.moe_layer_start_index = moe_layer_start_index
if moe_use_gate_correction_bias is not None:
self.moe_use_gate_correction_bias = moe_use_gate_correction_bias
self.ffn_hidden_size = ffn_hidden_size
self.rope_3d = rope_3d
self.export_model_type = export_model_type
self.use_stop_seqs = use_stop_seqs
self.return_all_hidden_states = return_all_hidden_states
self.start_layer_index = start_layer_index
self.output_via_mq = output_via_mq
self.generation_phase = generation_phase
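# Illustrative sketch (values are hypothetical): a ModelConfig is typically populated from a
# model's config.json through the PretrainedConfig machinery, but it can also be built directly:
#
#   config = ModelConfig(vocab_size=100224, hidden_size=4096, num_layers=48,
#                        num_attention_heads=32, hidden_act="swiglu", max_seq_len=8192)
#   assert config.head_dim == config.hidden_size // config.num_attention_heads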
@dataclass
class MoEConfig:
"""
Configuration for MoE.
"""
use_moe: bool = False
num_experts: int = -1
top_k: int = 8
moe_intermediate_size: int = -1
num_experts_per_rank: int = -1
num_experts_start_offset: int = -1
activation: str = "swiglu"
moe_use_gate_correction_bias: bool = False
moe_every2: bool = False
moe_num_shared_experts: int = 0
moe_layer_start_index: int = 0
moe_use_ffn_shared_weight_and_bias: bool = False
moe_group: bool = False
moe_quant_type: str = "default"
num_max_dispatch_tokens_per_rank: int = 256
has_multimodality: bool = False
im_patch_id: int = 100295  # multimodality, TODO(liuyuanle): read from config.json
moe_tag: str = ""
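# Illustrative note (hypothetical numbers): with num_experts=64 spread over 8 expert-parallel
# ranks, each rank would typically hold num_experts_per_rank=8 experts, and rank r would use
# num_experts_start_offset = r * 8 (so rank 2 owns experts 16..23).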
@dataclass
class ParallelConfig:
"""Configuration for the distributed execution."""
block_size: int = 16  # The block size for processing.
sequence_parallel: bool = False  # Whether to enable sequence parallelism.
use_ep: bool = False  # Whether to enable expert parallelism.
moe_group: bool = False  # Whether to enable the MoE communication group.
msg_queue_id: Optional[int] = None  # Message queue id.
use_micro_batch: bool = False  # Whether to enable micro batching.
tensor_parallel_rank: Optional[int] = None  # TP rank id.
tensor_parallel_degree: Optional[int] = None  # TP degree.
mp_size: int = 1  # Model-parallel size.
ep_size: int = 1  # Expert-parallel size.
# Whether the embedding weight is split across GPUs by column instead of by row.
# Defaults to False (split by row). Splitting by column is useful when vocab_size
# is not divisible by world_size but hidden_size is.
column_cut: bool = False
lm_head_column_cut: bool = False
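# Illustrative note (hypothetical sizes): with vocab_size=100300 and tensor_parallel_degree=8
# the vocabulary dimension is not evenly divisible, so column_cut=True can be used to split
# the embedding weight along hidden_size instead.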
@dataclass
class SpeculativeConfig:
"""
Configuration for speculative decoding.
"""
speculate_method: Optional[str] = None  # speculative decoding method
speculate_max_draft_token_num: int = 1  # max number of draft tokens proposed per step
draft_type: str = "None"  # draft model type
is_mtp: bool = False  # whether to use MTP (multi-token prediction)
speculate_max_candidate_len: int = 5  # max length of candidate tokens
speculate_verify_window: int = 2  # max length of the verification window
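# Rough flow (an interpretation of the fields above, not a verified runtime description):
# a draft model proposes up to speculate_max_draft_token_num tokens per step, candidate
# sequences are capped at speculate_max_candidate_len, and acceptance is checked within
# a window of speculate_verify_window tokens.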
@dataclass
class DeviceConfig:
"""
Configuration for device settings.
"""
@dataclass
class AdditionalConfig:
"""
Configuration for testing, debugging or others
"""
use_fake_parameter: bool = False  # whether to use fake parameters for testing
ep_just_for_test: bool = True  # whether expert parallelism is enabled just for testing
fake_server_p: bool = False  # whether to use a fake server
class WeightKeys:
"""
The parameter keys stored in your model_state.pdparams file.
"""
def __init__(self, num_layers):
"""
Initialize the keys used to retrieve weights from model_state.pdparams.
Args:
num_layers (int): Number of layers in the Transformer model.
Returns:
None
"""
self.norm_before_qkv_weight_keys = [None for i in range(num_layers)]
self.norm_before_qkv_bias_keys = [None for i in range(num_layers)]
self.qkv_linear_weight_keys = [None for i in range(num_layers)]
self.qkv_linear_bias_keys = [None for i in range(num_layers)]
self.out_linear_weight_keys = [None for i in range(num_layers)]
self.out_linear_bias_keys = [None for i in range(num_layers)]
self.ffn_layernorm_weight_keys = [None for i in range(num_layers)]
self.ffn_layernorm_bias_keys = [None for i in range(num_layers)]
self.ffn1_weight_keys = [None for i in range(num_layers)]
self.ffn1_bias_keys = [None for i in range(num_layers)]
self.ffn2_weight_keys = [None for i in range(num_layers)]
self.ffn2_bias_keys = [None for i in range(num_layers)]
self.moe_gate_weight_keys = None
self.moe_gate_correction_bias_keys = None
self.moe_ffn1_weight_keys = None
self.moe_ffn2_weight_keys = None
self.moe_ffn1_bias_keys = None
self.moe_ffn2_bias_keys = None
self.moe_ffn1_weight_scale_key = None
self.moe_ffn2_weight_scale_key = None
self.moe_ffn1_expert_in_scale_key = None
self.moe_ffn2_expert_in_scale_key = None
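# Illustrative sketch (hypothetical key names): callers fill these lists with the parameter
# names used in their checkpoint, e.g.
#
#   weight_keys = WeightKeys(num_layers=2)
#   weight_keys.qkv_linear_weight_keys[0] = "ernie.decoder.layers.0.self_attn.qkv_proj.weight"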
@dataclass
class GraphOptimizationConfig:
"""The Top-level graph optimization contral corresponds to different backends.
- 0: dyncmic graph
- 1: static graph
- 2: static graph + cinn compilation backend
"""
graph_opt_level: int = 0
# CUDA Graph Config
""" Whether to use cudagraph.
- False: cudagraph is not used.
- True: cudagraph is used.
It requires that all input buffers have fixed addresses, and all
splitting ops write their outputs to input buffers.
- With dynamic graph backend: ...
- With static graph backend: WIP
"""
use_cudagraph: bool = False
"""Sizes to capture cudagraph.
- None (default): capture sizes are inferred from llm config.
- list[int]: capture sizes are specified as given."""
cudagraph_capture_sizes: Optional[list[int]] = None
""" Number of warmup runs for cudagraph. """
cudagraph_num_of_warmups: int = 2
"""Whether to copy input tensors for cudagraph.
If the caller can guarantee that the same input buffers
are always used, it can set this to False. Otherwise, it should
set this to True."""
cudagraph_copy_inputs: bool = False
""" In static graph, this is an operation list that does not need to be captured by the CUDA graph.
CudaGraphBackend will split these operations from the static graph.
Example usage:
cudagraph_splitting_ops = ["paddle.unified_attention"]
Note: If you want to use subgraph capture functionality in a dynamic graph,
you can manually split the model into multiple layers and apply the @support_cuda_graph decorator
only to the layer where CUDA graph functionality is required.
"""
cudagraph_splitting_ops: Optional[list[str]] = None
"""Whether to use a full cuda graph for the entire forward pass rather than
splitting certain operations such as attention into subgraphs.
Thus this flag cannot be used together with splitting_ops."""
full_cuda_graph: bool = False
max_capture_size: int = field(default=None, init=False) # type: ignore
batch_size_to_captured_size: list[int] = field(default=None, init=False)  # type: ignore
# CINN Config ...
def init_with_cudagrpah_size(self,
cudagraph_capture_sizes: list[int]) -> None:
"""To complete the initialization of config,
we need to know the cudagraph sizes"""
if self.cudagraph_capture_sizes is None:
self.cudagraph_capture_sizes = cudagraph_capture_sizes
else:
dedup_sizes = list(set(self.cudagraph_capture_sizes))
if len(dedup_sizes) < len(self.cudagraph_capture_sizes):
logger.info(("cudagraph sizes specified by model runner"
" %s is overridden by config %s"),
cudagraph_capture_sizes, dedup_sizes)
self.cudagraph_capture_sizes = dedup_sizes
# sort to make sure cudagraph capture sizes are in descending order
self.cudagraph_capture_sizes.sort(reverse=True)
self.max_capture_size = self.cudagraph_capture_sizes[
0] if self.cudagraph_capture_sizes else 0
# pre-compute the mapping from batch size to padded graph size
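# Illustrative example: with capture sizes [8, 4, 2, 1], a batch size equal to a captured
# size maps to itself and anything in between is padded up to the next captured size
# (e.g. 3 -> 4 and 5..7 -> 8), while 0 maps to 0.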
self.batch_size_to_captured_size = [
0 for i in range(self.max_capture_size + 1)
]
for end, start in zip(self.cudagraph_capture_sizes,
self.cudagraph_capture_sizes[1:] + [0]):
for bs in range(start, end):
if bs == start:
self.batch_size_to_captured_size[bs] = start
else:
self.batch_size_to_captured_size[bs] = end
self.batch_size_to_captured_size[
self.max_capture_size] = self.max_capture_size
@dataclass
class LoadConfig:
"""
Configuration for loading parameters
"""
model_path: Optional[str] = None  # The path to the model file.
weight_keys: Optional[WeightKeys] = None  # Keys used to retrieve weights from the model state dict.
scale_dir: Optional[str] = None  # The directory where the scale file is located.
act_scales: Optional[dict] = None  # Activation quantization scales, looked up by parameter key.
bias_keys = None
def _post_init(self, model_config):
if self.weight_keys:
self.weight_keys_mapping = self._create_weight_key_by_layer_name(
model_config)
else:
self.weight_keys_mapping = {}
self.quant_scale_mapping = self._create_quant_scale_mapping(
model_config)
def _create_weight_key_by_layer_name(self, model_config) -> dict:
mapping = {}
weight_keys = self.weight_keys
num_layers = model_config.num_layers
for i in range(num_layers):
if i == 0:
layer_name = f"{model_config.base_model_prefix}.decoder.layers.0.norm1"
mapping[layer_name] = weight_keys.norm_before_qkv_weight_keys[
0]
if i < num_layers:
layer_name = f"{model_config.base_model_prefix}.decoder.layers.{i}.norm2"
mapping[layer_name] = weight_keys.ffn_layernorm_weight_keys[i]
for i in range(num_layers - 1):
layer_name = f"{model_config.base_model_prefix}.decoder.layers.{i+1}.norm1"
mapping[layer_name] = weight_keys.norm_before_qkv_weight_keys[i +
1]
layer_name = f"{model_config.base_model_prefix}.decoder.norm"
if not model_config.use_moe:
mapping[
layer_name] = f"{model_config.base_model_prefix}.decoder.norm.weight"
else:
mapping[layer_name] = "ernie.norm.weight"
layer_name = f"{model_config.base_model_prefix}.e_norm"
mapping[layer_name] = f"{model_config.base_model_prefix}.e_norm.weight"
layer_name = f"{model_config.base_model_prefix}.h_norm"
mapping[layer_name] = f"{model_config.base_model_prefix}.h_norm.weight"
return mapping
def _create_quant_scale_mapping(self, model_config) -> dict:
mapping = {}
act_scales = self.act_scales
num_layers = model_config.num_layers
for i in range(num_layers):
if i == 0:
layer_name = f"{model_config.base_model_prefix}.decoder.layers.0.norm1"
mapping[layer_name] = act_scales.get(
f"{model_config.base_model_prefix}.decoder.layers.0.self_attn.qkv_proj.activation_quanter",
-1)
if i < num_layers:
layer_name = f"{model_config.base_model_prefix}.decoder.layers.{i}.norm2"
mapping[layer_name] = act_scales.get(
f"{model_config.base_model_prefix}.decoder.layers.{i}.linear1.activation_quanter",
-1)
for i in range(num_layers - 1):
layer_name = f"{model_config.base_model_prefix}.decoder.layers.{i+1}.norm1"
mapping[layer_name] = act_scales.get(
f"{model_config.base_model_prefix}.decoder.layers.{i + 1}.self_attn.qkv_proj.activation_quanter",
-1)
return mapping
def get_weight_key_by_layer_name(self, layer_name: str) -> Optional[str]:
return self.weight_keys_mapping.get(layer_name)
def get_quant_scale_by_layer_name(self, layer_name: str) -> Optional[int]:
return self.quant_scale_mapping.get(layer_name)
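# Illustrative usage (hypothetical path, minimal placeholders): _post_init must be called
# explicitly after construction before the per-layer lookups can be used, e.g.
#
#   load_config = LoadConfig(model_path="/path/to/model",
#                            weight_keys=WeightKeys(model_config.num_layers))
#   load_config.act_scales = {}  # per-key activation scales; empty here just for illustration
#   load_config._post_init(model_config)
#   key = load_config.get_weight_key_by_layer_name(
#       f"{model_config.base_model_prefix}.decoder.layers.0.norm1")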
@dataclass
class LoRAConfig:
""" LoRA Config """
pass
@dataclass
class SchedulerConfig:
""" Scheduler Config """
pass
@dataclass
class KVCacheConfig:
""" KV Cache Config """
block_size: int = 0
enc_dec_block_num: int = 2
kv_cache_ratio: float = 0.75
dtype: str = 'bfloat16'
kvcache_quant_config: Optional[QuantConfigBase] = None
class TmpConfig:
"""
TODO(yuanrisheng): TmpConfig will be moved to other config classes when the refactoring work is relatively complete.
"""
cache_quant_dtype: str = "default"
has_zero_point: bool = False
is_channel_wise: bool = False
weight_block_size: int = 16
use_offline_quant: bool = False
@dataclass
class DecodingConfig:
"""
Configuration for decoding
"""
max_dec_len: int = 20
min_dec_len: int = 0
decode_strategy: str = "sampling"
bos_token_id: Optional[int] = None
pad_token_id: Optional[int] = None
num_return_sequences: int = 1
@dataclass
class LLMConfig:
"""
The configuration class which contains all fastdeploy-related configuration. This
simplifies passing around the distinct configurations in the codebase.
"""
model_config: ModelConfig = field(default=None, init=True) # type: ignore
parallel_config: ParallelConfig = field(default=None, init=True)
speculative_config: SpeculativeConfig = field(default=None,
init=True) # type: ignore
device_config: DeviceConfig = field(default=None,
init=True) # type: ignore
additional_config: AdditionalConfig = field(default=None,
init=True) # type: ignore
load_config: LoadConfig = field(default=None, init=True) # type: ignore
quant_config: Optional[QuantConfigBase] = None
graph_opt_config: Optional[GraphOptimizationConfig] = None
tmp_config: TmpConfig = field(default=None, init=True)
moe_config: MoEConfig = field(default=None, init=True) # type: ignore
decoding_config: DecodingConfig = field(default=None,
init=True) # type: ignore
kvcache_config: KVCacheConfig = field(default=None,
init=True) # type: ignore
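# Illustrative sketch (hypothetical values): LLMConfig simply aggregates the per-component
# configs above so they can be passed around together, e.g.
#
#   llm_config = LLMConfig(
#       model_config=ModelConfig(num_layers=48, hidden_size=4096),
#       parallel_config=ParallelConfig(),
#       load_config=LoadConfig(model_path="/path/to/model"),
#       kvcache_config=KVCacheConfig(block_size=64),
#   )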