【FIX】Change the name of sparse attn from moba to plas (#3845)
* Update docs
* Update docs
* Update docs
* Update docs
* Rename moba to plas
* code style
* update ci
* code style
* update ci
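The renamed config is driven by a plain dict of `plas_*` keys, just as the old one was driven by `moba_*` keys. A minimal usage sketch, assuming the constructor shown in the diff below; the import path and the numeric values are illustrative assumptions, not part of this commit:

```python
# Minimal usage sketch for the renamed config. The class name and constructor
# come from the diff below; the import path and the values are assumptions.
from fastdeploy.config import PlasAttentionConfig  # assumed module path

plas_cfg = PlasAttentionConfig(
    {
        "plas_encoder_top_k_left": 2,  # illustrative values
        "plas_encoder_top_k_right": 4,
        "plas_decoder_top_k_left": 2,
        "plas_decoder_top_k_right": 4,
    }
)
# Unset seq limits are derived as top_k_left * plas_block_size (default 128),
# so both limits below evaluate to 2 * 128 = 256.
print(plas_cfg.plas_use_encoder_seq_limit)  # 256
print(plas_cfg.plas_use_decoder_seq_limit)  # 256
```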
@@ -690,63 +690,63 @@ class GraphOptimizationConfig:
         argument = self.use_cudagraph


-class MobaAttentionConfig:
+class PlasAttentionConfig:
     def __init__(
         self,
         args,
     ):
-        self.moba_encoder_top_k_left: int = None
-        self.moba_encoder_top_k_right: int = None
-        "The sparse topk of encoder attention is located at [moba_encoder_top_k_left, moba_encoder top_k_right]"
-        self.moba_decoder_top_k_left: int = None
-        self.moba_decoder_top_k_right: int = None
-        "The sparse topk of decoder attention is located at [moba_decoder_top_k_left, moba_decoder top_k_right]"
-        self.moba_use_encoder_seq_limit: int = None
-        "When the number of encdoer token is less than moba_use_encoder_seq_limit, it is not sparse"
-        self.moba_use_decoder_seq_limit: int = None
-        "When the number of decdoer token is less than moba_use_decoder_seq_limit, it is not sparse"
-        self.moba_block_size: int = 128
-        self.mlp_weight_name: str = "moba_mlp_weight.safetensors"
-        self.moba_max_seq_length: int = 128 * 1024
+        self.plas_encoder_top_k_left: int = None
+        self.plas_encoder_top_k_right: int = None
+        "The sparse topk of encoder attention is located at [plas_encoder_top_k_left, plas_encoder top_k_right]"
+        self.plas_decoder_top_k_left: int = None
+        self.plas_decoder_top_k_right: int = None
+        "The sparse topk of decoder attention is located at [plas_decoder_top_k_left, plas_decoder top_k_right]"
+        self.plas_use_encoder_seq_limit: int = None
+        "When the number of encdoer token is less than plas_use_encoder_seq_limit, it is not sparse"
+        self.plas_use_decoder_seq_limit: int = None
+        "When the number of decdoer token is less than plas_use_decoder_seq_limit, it is not sparse"
+        self.plas_block_size: int = 128
+        self.mlp_weight_name: str = "plas_attention_mlp_weight.safetensors"
+        self.plas_max_seq_length: int = 128 * 1024
         if args is not None:
             for key, value in args.items():
                 if hasattr(self, key):
                     setattr(self, key, value)
-        if self.moba_use_encoder_seq_limit is None and self.moba_encoder_top_k_left is not None:
-            self.moba_use_encoder_seq_limit = self.moba_encoder_top_k_left * self.moba_block_size
-        if self.moba_use_decoder_seq_limit is None and self.moba_decoder_top_k_left is not None:
-            self.moba_use_decoder_seq_limit = self.moba_decoder_top_k_left * self.moba_block_size
+        if self.plas_use_encoder_seq_limit is None and self.plas_encoder_top_k_left is not None:
+            self.plas_use_encoder_seq_limit = self.plas_encoder_top_k_left * self.plas_block_size
+        if self.plas_use_decoder_seq_limit is None and self.plas_decoder_top_k_left is not None:
+            self.plas_use_decoder_seq_limit = self.plas_decoder_top_k_left * self.plas_block_size
         self.check_legality_parameters()

     def check_legality_parameters(
         self,
     ) -> None:
-        if self.moba_encoder_top_k_left is not None:
-            assert self.moba_encoder_top_k_left > 0, "moba_encoder_top_k_left must large than 0"
+        if self.plas_encoder_top_k_left is not None:
+            assert self.plas_encoder_top_k_left > 0, "plas_encoder_top_k_left must large than 0"

-        if self.moba_encoder_top_k_right is not None:
-            assert self.moba_encoder_top_k_right > 0, "moba_encoder_top_k_right must large than 0"
+        if self.plas_encoder_top_k_right is not None:
+            assert self.plas_encoder_top_k_right > 0, "plas_encoder_top_k_right must large than 0"
             assert (
-                self.moba_encoder_top_k_right >= self.moba_encoder_top_k_left
-            ), "moba_encoder_top_k_right must large than moba_encoder_top_k_left"
+                self.plas_encoder_top_k_right >= self.plas_encoder_top_k_left
+            ), "plas_encoder_top_k_right must large than plas_encoder_top_k_left"

-        if self.moba_decoder_top_k_left is not None:
-            assert self.moba_decoder_top_k_left > 0, "moba_decoder_top_k_left must large than 0"
+        if self.plas_decoder_top_k_left is not None:
+            assert self.plas_decoder_top_k_left > 0, "plas_decoder_top_k_left must large than 0"

-        if self.moba_decoder_top_k_right is not None:
-            assert self.moba_decoder_top_k_right > 0, "moba_decoder_top_k_right must large than 0"
+        if self.plas_decoder_top_k_right is not None:
+            assert self.plas_decoder_top_k_right > 0, "plas_decoder_top_k_right must large than 0"
             assert (
-                self.moba_decoder_top_k_right >= self.moba_decoder_top_k_left
-            ), "moba_decoder_top_k_right must large than moba_decoder_top_k_left"
+                self.plas_decoder_top_k_right >= self.plas_decoder_top_k_left
+            ), "plas_decoder_top_k_right must large than plas_decoder_top_k_left"

-        if self.moba_use_encoder_seq_limit is not None and self.moba_encoder_top_k_left is not None:
-            assert self.moba_use_encoder_seq_limit >= self.moba_encoder_top_k_left * self.moba_block_size
-        if self.moba_use_decoder_seq_limit is not None and self.moba_decoder_top_k_left is not None:
-            assert self.moba_use_decoder_seq_limit >= self.moba_decoder_top_k_left * self.moba_block_size
+        if self.plas_use_encoder_seq_limit is not None and self.plas_encoder_top_k_left is not None:
+            assert self.plas_use_encoder_seq_limit >= self.plas_encoder_top_k_left * self.plas_block_size
+        if self.plas_use_decoder_seq_limit is not None and self.plas_decoder_top_k_left is not None:
+            assert self.plas_use_decoder_seq_limit >= self.plas_decoder_top_k_left * self.plas_block_size

     def to_json_string(self):
         """
-        Convert moba_attention_config to json string.
+        Convert plas_attention_config to json string.
         """
         return json.dumps({key: value for key, value in self.__dict__.items() if value is not None})

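The legality checks above also tie an explicitly supplied limit to the block size: `plas_use_encoder_seq_limit` must be at least `plas_encoder_top_k_left * plas_block_size` (and likewise for the decoder). A short sketch of a value that the renamed checks would reject; the import path and numbers are again illustrative assumptions:

```python
# Sketch of the seq-limit legality check shown above; values are illustrative.
from fastdeploy.config import PlasAttentionConfig  # assumed module path

try:
    PlasAttentionConfig(
        {
            "plas_encoder_top_k_left": 2,
            "plas_encoder_top_k_right": 4,
            "plas_use_encoder_seq_limit": 100,  # below 2 * 128 = 256
        }
    )
except AssertionError:
    print("rejected: plas_use_encoder_seq_limit < plas_encoder_top_k_left * plas_block_size")
```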
@@ -1105,7 +1105,7 @@ class FDConfig:
         decoding_config: DecodingConfig = None,
         quant_config: QuantConfigBase = None,
         graph_opt_config: GraphOptimizationConfig = None,
-        moba_attention_config: MobaAttentionConfig = None,
+        plas_attention_config: PlasAttentionConfig = None,
         speculative_config: SpeculativeConfig = None,
         tokenizer: str = None,
         max_model_len: int = 8192,
@@ -1140,7 +1140,7 @@ class FDConfig:
         self.early_stop_config: Optional[EarlyStopConfig] = early_stop_config
         self.decoding_config: DecodingConfig = decoding_config  # type: ignore
         self.cache_config: CacheConfig = cache_config  # type: ignore
-        self.moba_attention_config: Optional[MobaAttentionConfig] = moba_attention_config
+        self.plas_attention_config: Optional[PlasAttentionConfig] = plas_attention_config
         # Initialize cuda graph capture list
         if self.graph_opt_config.cudagraph_capture_sizes is None:
             self.graph_opt_config._set_cudagraph_sizes(max_num_seqs=self.parallel_config.max_num_seqs)
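For callers of `FDConfig`, the only change in these two hunks is the keyword and the attribute name. A hypothetical migration sketch; the helper, the extra keyword arguments, and the import path are illustrative assumptions rather than FastDeploy APIs:

```python
# Hypothetical migration sketch: only the renamed keyword changes for callers.
from fastdeploy.config import FDConfig, PlasAttentionConfig  # assumed module path


def build_config(plas_cfg: PlasAttentionConfig, **other_kwargs) -> FDConfig:
    # Before this commit the same object was passed as `moba_attention_config=...`
    # and read back as `config.moba_attention_config`; it is now `plas_attention_config`.
    return FDConfig(plas_attention_config=plas_cfg, **other_kwargs)
```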