Revert "[Feature] block sparse attention (#3209)" (#3647)

This reverts commit 646a0c2fd8.
2025-10-18 22:44:39 +08:00 · 2025-08-27 17:35:04 +08:00
parent b2afdf4fc6
commit c694fa2879
31 changed files with 10 additions and 6507 deletions
--- a/fastdeploy/engine/args_utils.py
+++ b/fastdeploy/engine/args_utils.py
@@ -26,7 +26,6 @@ from fastdeploy.config import (
    FDConfig,
    GraphOptimizationConfig,
    LoadConfig,
-    MobaAttentionConfig,
    ModelConfig,
    ParallelConfig,
    SpeculativeConfig,
@@ -337,10 +336,6 @@ class EngineArgs:
    """
    Configuration for graph optimization backend execution.
    """
-    moba_attention_config: Optional[Dict[str, Any]] = None
-    """
-    Configuration for moba attention.
-    """

    enable_logprob: bool = False
    """
@@ -539,12 +534,6 @@ class EngineArgs:
            default=EngineArgs.graph_optimization_config,
            help="",
        )
-        model_group.add_argument(
-            "--moba-attention-config",
-            type=json.loads,
-            default=EngineArgs.moba_attention_config,
-            help="",
-        )
        model_group.add_argument(
            "--guided-decoding-backend",
            type=str,
@@ -940,18 +929,6 @@ class EngineArgs:
                graph_optimization_args[k] = v
        return GraphOptimizationConfig(graph_optimization_args)

-    def create_moba_attention_config(self) -> MobaAttentionConfig:
-        """
-        Create and retuan a MobaAttentionConfig object based on the current settings.
-        """
-        attention_args = asdict(self)
-        if self.moba_attention_config is not None:
-            for k, v in self.moba_attention_config.items():
-                attention_args[k] = v
-            return MobaAttentionConfig(attention_args)
-        else:
-            return MobaAttentionConfig(None)
-
    def create_early_stop_config(self) -> EarlyStopConfig:
        """
        Create and retuan an EarlyStopConfig object based on the current settings.
@@ -989,7 +966,6 @@ class EngineArgs:
        speculative_cfg = self.create_speculative_config()
        graph_opt_cfg = self.create_graph_optimization_config()
        graph_opt_cfg.update_use_cudagraph(self.use_cudagraph)
-        moba_attention_config = self.create_moba_attention_config()

        early_stop_cfg = self.create_early_stop_config()
        early_stop_cfg.update_enable_early_stop(self.enable_early_stop)
@@ -1027,7 +1003,6 @@ class EngineArgs:
            max_long_partial_prefills=self.max_long_partial_prefills,
            long_prefill_token_threshold=self.long_prefill_token_threshold,
            graph_opt_config=graph_opt_cfg,
-            moba_attention_config=moba_attention_config,
            guided_decoding_backend=self.guided_decoding_backend,
            disable_any_whitespace=self.guided_decoding_disable_any_whitespace,
            early_stop_config=early_stop_cfg,
--- a/fastdeploy/engine/engine.py
+++ b/fastdeploy/engine/engine.py
@@ -464,7 +464,6 @@ class LLMEngine:
            f" --load_strategy {self.cfg.load_config.load_strategy}"
            f" --early_stop_config '{self.cfg.early_stop_config.to_json_string()}'"
            f" --load_choices {self.cfg.load_config.load_choices}"
-            f" --moba_attention_config '{self.cfg.moba_attention_config.to_json_string()}'"
            f" --ips {ips}"
        )