【FIX】Change the name of sparse attn from moba to plas (#3845)

* Update docs

* Update docs

* Update docs

* Update docs

* Change moba to plas

* code style

* update ci

* code style

* update ci
Author: yangjianfengo1
Date: 2025-09-09 10:56:50 +08:00 (committed by GitHub)
Parent: de34222842
Commit: e31c8f7336
13 changed files with 150 additions and 150 deletions


@@ -690,63 +690,63 @@ class GraphOptimizationConfig:
         argument = self.use_cudagraph
 
 
-class MobaAttentionConfig:
+class PlasAttentionConfig:
     def __init__(
         self,
         args,
     ):
-        self.moba_encoder_top_k_left: int = None
-        self.moba_encoder_top_k_right: int = None
-        "The sparse topk of encoder attention is located at [moba_encoder_top_k_left, moba_encoder top_k_right]"
-        self.moba_decoder_top_k_left: int = None
-        self.moba_decoder_top_k_right: int = None
-        "The sparse topk of decoder attention is located at [moba_decoder_top_k_left, moba_decoder top_k_right]"
-        self.moba_use_encoder_seq_limit: int = None
-        "When the number of encdoer token is less than moba_use_encoder_seq_limit, it is not sparse"
-        self.moba_use_decoder_seq_limit: int = None
-        "When the number of decdoer token is less than moba_use_decoder_seq_limit, it is not sparse"
-        self.moba_block_size: int = 128
-        self.mlp_weight_name: str = "moba_mlp_weight.safetensors"
-        self.moba_max_seq_length: int = 128 * 1024
+        self.plas_encoder_top_k_left: int = None
+        self.plas_encoder_top_k_right: int = None
+        "The sparse topk of encoder attention is located at [plas_encoder_top_k_left, plas_encoder top_k_right]"
+        self.plas_decoder_top_k_left: int = None
+        self.plas_decoder_top_k_right: int = None
+        "The sparse topk of decoder attention is located at [plas_decoder_top_k_left, plas_decoder top_k_right]"
+        self.plas_use_encoder_seq_limit: int = None
+        "When the number of encdoer token is less than plas_use_encoder_seq_limit, it is not sparse"
+        self.plas_use_decoder_seq_limit: int = None
+        "When the number of decdoer token is less than plas_use_decoder_seq_limit, it is not sparse"
+        self.plas_block_size: int = 128
+        self.mlp_weight_name: str = "plas_attention_mlp_weight.safetensors"
+        self.plas_max_seq_length: int = 128 * 1024
         if args is not None:
             for key, value in args.items():
                 if hasattr(self, key):
                     setattr(self, key, value)
-        if self.moba_use_encoder_seq_limit is None and self.moba_encoder_top_k_left is not None:
-            self.moba_use_encoder_seq_limit = self.moba_encoder_top_k_left * self.moba_block_size
-        if self.moba_use_decoder_seq_limit is None and self.moba_decoder_top_k_left is not None:
-            self.moba_use_decoder_seq_limit = self.moba_decoder_top_k_left * self.moba_block_size
+        if self.plas_use_encoder_seq_limit is None and self.plas_encoder_top_k_left is not None:
+            self.plas_use_encoder_seq_limit = self.plas_encoder_top_k_left * self.plas_block_size
+        if self.plas_use_decoder_seq_limit is None and self.plas_decoder_top_k_left is not None:
+            self.plas_use_decoder_seq_limit = self.plas_decoder_top_k_left * self.plas_block_size
         self.check_legality_parameters()
 
     def check_legality_parameters(
         self,
     ) -> None:
-        if self.moba_encoder_top_k_left is not None:
-            assert self.moba_encoder_top_k_left > 0, "moba_encoder_top_k_left must large than 0"
+        if self.plas_encoder_top_k_left is not None:
+            assert self.plas_encoder_top_k_left > 0, "plas_encoder_top_k_left must large than 0"
 
-        if self.moba_encoder_top_k_right is not None:
-            assert self.moba_encoder_top_k_right > 0, "moba_encoder_top_k_right must large than 0"
+        if self.plas_encoder_top_k_right is not None:
+            assert self.plas_encoder_top_k_right > 0, "plas_encoder_top_k_right must large than 0"
             assert (
-                self.moba_encoder_top_k_right >= self.moba_encoder_top_k_left
-            ), "moba_encoder_top_k_right must large than moba_encoder_top_k_left"
+                self.plas_encoder_top_k_right >= self.plas_encoder_top_k_left
+            ), "plas_encoder_top_k_right must large than plas_encoder_top_k_left"
 
-        if self.moba_decoder_top_k_left is not None:
-            assert self.moba_decoder_top_k_left > 0, "moba_decoder_top_k_left must large than 0"
+        if self.plas_decoder_top_k_left is not None:
+            assert self.plas_decoder_top_k_left > 0, "plas_decoder_top_k_left must large than 0"
 
-        if self.moba_decoder_top_k_right is not None:
-            assert self.moba_decoder_top_k_right > 0, "moba_decoder_top_k_right must large than 0"
+        if self.plas_decoder_top_k_right is not None:
+            assert self.plas_decoder_top_k_right > 0, "plas_decoder_top_k_right must large than 0"
             assert (
-                self.moba_decoder_top_k_right >= self.moba_decoder_top_k_left
-            ), "moba_decoder_top_k_right must large than moba_decoder_top_k_left"
+                self.plas_decoder_top_k_right >= self.plas_decoder_top_k_left
+            ), "plas_decoder_top_k_right must large than plas_decoder_top_k_left"
 
-        if self.moba_use_encoder_seq_limit is not None and self.moba_encoder_top_k_left is not None:
-            assert self.moba_use_encoder_seq_limit >= self.moba_encoder_top_k_left * self.moba_block_size
-        if self.moba_use_decoder_seq_limit is not None and self.moba_decoder_top_k_left is not None:
-            assert self.moba_use_decoder_seq_limit >= self.moba_decoder_top_k_left * self.moba_block_size
+        if self.plas_use_encoder_seq_limit is not None and self.plas_encoder_top_k_left is not None:
+            assert self.plas_use_encoder_seq_limit >= self.plas_encoder_top_k_left * self.plas_block_size
+        if self.plas_use_decoder_seq_limit is not None and self.plas_decoder_top_k_left is not None:
+            assert self.plas_use_decoder_seq_limit >= self.plas_decoder_top_k_left * self.plas_block_size
 
     def to_json_string(self):
         """
-        Convert moba_attention_config to json string.
+        Convert plas_attention_config to json string.
         """
         return json.dumps({key: value for key, value in self.__dict__.items() if value is not None})
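
For reference, a minimal usage sketch of the renamed class as it reads after this change. It assumes the PlasAttentionConfig defined in the hunk above is in scope; the concrete top-k values are illustrative and do not come from the commit.

# Illustrative only: construct the renamed config from a plain args dict,
# the same way the old moba_* keys used to be passed.
cfg = PlasAttentionConfig(
    args={
        "plas_encoder_top_k_left": 4,
        "plas_encoder_top_k_right": 8,
        "plas_decoder_top_k_left": 2,
        "plas_decoder_top_k_right": 4,
    }
)

# With plas_block_size left at its default of 128, __init__ derives the unset
# sequence limits as top_k_left * block_size.
assert cfg.plas_use_encoder_seq_limit == 4 * 128
assert cfg.plas_use_decoder_seq_limit == 2 * 128

print(cfg.to_json_string())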
@@ -1105,7 +1105,7 @@ class FDConfig:
         decoding_config: DecodingConfig = None,
         quant_config: QuantConfigBase = None,
         graph_opt_config: GraphOptimizationConfig = None,
-        moba_attention_config: MobaAttentionConfig = None,
+        plas_attention_config: PlasAttentionConfig = None,
         speculative_config: SpeculativeConfig = None,
         tokenizer: str = None,
         max_model_len: int = 8192,
@@ -1140,7 +1140,7 @@ class FDConfig:
         self.early_stop_config: Optional[EarlyStopConfig] = early_stop_config
         self.decoding_config: DecodingConfig = decoding_config  # type: ignore
         self.cache_config: CacheConfig = cache_config  # type: ignore
-        self.moba_attention_config: Optional[MobaAttentionConfig] = moba_attention_config
+        self.plas_attention_config: Optional[PlasAttentionConfig] = plas_attention_config
         # Initialize cuda graph capture list
         if self.graph_opt_config.cudagraph_capture_sizes is None:
             self.graph_opt_config._set_cudagraph_sizes(max_num_seqs=self.parallel_config.max_num_seqs)
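
A hedged sketch of how downstream code might consume the renamed FDConfig field. The helper below is purely illustrative and is not part of this commit; fd_config is assumed to be an already constructed FDConfig.

# Illustrative helper, not from the diff: decide whether the encoder path
# should use sparse (plas) attention for a given prompt length.
def use_sparse_encoder_attention(fd_config, num_encoder_tokens: int) -> bool:
    plas_cfg = fd_config.plas_attention_config
    if plas_cfg is None or plas_cfg.plas_use_encoder_seq_limit is None:
        return False
    # Per the config docstring above: prompts shorter than the limit stay dense.
    return num_encoder_tokens >= plas_cfg.plas_use_encoder_seq_limit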