[Feature] block sparse attention (#3668)

* support sparse attention

* fix bug

* code style

* fix moba attn get kv shape

* fix A100 build

* codestyle

* code style

* code style

* code style

* fix conflict

* add unit tests

* code style

* increase eblite loading time

* fix bug

* for ci

* for ci

* for ci

* for ci

* support mlp block size 128

* add unit tests for small operators

* fix mlp unit test

* move environment variables into the config

* fix rollout config

* fix GPU memory usage

* add test server

* add test server

* fix mlp: use full attention for the last layer
Author: yangjianfengo1
Date: 2025-08-29 19:46:30 +08:00
Committed by: GitHub
Parent: ccd52b5596
Commit: 3754a9906d
31 changed files with 6553 additions and 10 deletions

@@ -26,6 +26,7 @@ from fastdeploy.config import (
FDConfig,
GraphOptimizationConfig,
LoadConfig,
MobaAttentionConfig,
ModelConfig,
ParallelConfig,
SpeculativeConfig,
@@ -336,6 +337,10 @@ class EngineArgs:
"""
Configuration for graph optimization backend execution.
"""
moba_attention_config: Optional[Dict[str, Any]] = None
"""
Configuration for moba attention.
"""
enable_logprob: bool = False
"""
@@ -534,6 +539,12 @@ class EngineArgs:
default=EngineArgs.graph_optimization_config,
help="",
)
model_group.add_argument(
"--moba-attention-config",
type=json.loads,
default=EngineArgs.moba_attention_config,
help="",
)
model_group.add_argument(
"--guided-decoding-backend",
type=str,
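
Note: --moba-attention-config is parsed with type=json.loads, so it must be passed as a JSON object string. A minimal, self-contained sketch of that parsing behavior (the key name used here is purely illustrative and not taken from MobaAttentionConfig):

    import argparse
    import json

    # argparse applies json.loads to the raw flag value, producing a dict
    # (or the default, None, when the flag is omitted).
    parser = argparse.ArgumentParser()
    parser.add_argument("--moba-attention-config", type=json.loads, default=None)

    args = parser.parse_args(["--moba-attention-config", '{"moba_block_size": 128}'])
    print(args.moba_attention_config)  # {'moba_block_size': 128}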
@@ -929,6 +940,18 @@ class EngineArgs:
graph_optimization_args[k] = v
return GraphOptimizationConfig(graph_optimization_args)
def create_moba_attention_config(self) -> MobaAttentionConfig:
"""
Create and return a MobaAttentionConfig object based on the current settings.
"""
attention_args = asdict(self)
if self.moba_attention_config is not None:
for k, v in self.moba_attention_config.items():
attention_args[k] = v
return MobaAttentionConfig(attention_args)
else:
return MobaAttentionConfig(None)
def create_early_stop_config(self) -> EarlyStopConfig:
"""
Create and return an EarlyStopConfig object based on the current settings.
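
The merge in create_moba_attention_config starts from asdict(self), i.e. every engine argument, and then overlays any user-supplied moba settings so that explicit values win. An isolated sketch of that pattern with a stand-in dataclass (EngineArgs itself has many more fields; the keys are illustrative):

    from dataclasses import asdict, dataclass
    from typing import Any, Dict, Optional

    @dataclass
    class _Args:  # stand-in for EngineArgs, illustrative only
        max_num_seqs: int = 8
        moba_attention_config: Optional[Dict[str, Any]] = None

    args = _Args(moba_attention_config={"moba_block_size": 128})
    merged = asdict(args)  # every engine argument becomes a candidate config key
    if args.moba_attention_config is not None:
        for k, v in args.moba_attention_config.items():
            merged[k] = v  # user-supplied values override the defaults
    print(merged["moba_block_size"], merged["max_num_seqs"])  # 128 8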
@@ -966,6 +989,7 @@ class EngineArgs:
speculative_cfg = self.create_speculative_config()
graph_opt_cfg = self.create_graph_optimization_config()
graph_opt_cfg.update_use_cudagraph(self.use_cudagraph)
moba_attention_config = self.create_moba_attention_config()
early_stop_cfg = self.create_early_stop_config()
early_stop_cfg.update_enable_early_stop(self.enable_early_stop)
@@ -1003,6 +1027,7 @@ class EngineArgs:
max_long_partial_prefills=self.max_long_partial_prefills,
long_prefill_token_threshold=self.long_prefill_token_threshold,
graph_opt_config=graph_opt_cfg,
moba_attention_config=moba_attention_config,
guided_decoding_backend=self.guided_decoding_backend,
disable_any_whitespace=self.guided_decoding_disable_any_whitespace,
early_stop_config=early_stop_cfg,

@@ -470,6 +470,7 @@ class LLMEngine:
f" --load_strategy {self.cfg.load_config.load_strategy}"
f" --early_stop_config '{self.cfg.early_stop_config.to_json_string()}'"
f" --load_choices {self.cfg.load_config.load_choices}"
f" --moba_attention_config '{self.cfg.moba_attention_config.to_json_string()}'"
f" --ips {ips}"
)
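
For reference, the engine embeds the moba attention settings into the worker launch command as a single-quoted JSON string (via to_json_string()), and the worker decodes it again through the --moba-attention-config argument added above. A rough sketch of that round trip (MobaAttentionConfig internals are not shown in this diff; the dict below is illustrative):

    import json
    import shlex

    moba_cfg = {"moba_block_size": 128}            # illustrative settings
    json_str = json.dumps(moba_cfg)                # what to_json_string() would emit
    worker_flag = f"--moba_attention_config {shlex.quote(json_str)}"
    print(worker_flag)   # --moba_attention_config '{"moba_block_size": 128}'

    # On the worker side, argparse's type=json.loads restores the dict:
    assert json.loads(json_str) == moba_cfg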