[feat] support fa3 backend for pd disaggregated (#2695)

* support fa3 backend run in pd disaggregated * support fa3 backend run in pd disaggregated * support fa3 backend run in pd disaggregated * support fa3 backend run in pd disaggregated * delete use_fast_ffn
2025-10-04 00:06:38 +08:00 · 2025-07-03 22:33:27 +08:00
parent 00863c43fd
commit 240bdac2a4
26 changed files with 455 additions and 139 deletions
--- a/fastdeploy/model_executor/models/qwen2.py
+++ b/fastdeploy/model_executor/models/qwen2.py
@@ -27,7 +27,7 @@ from fastdeploy.config import FDConfig, ModelConfig
 from fastdeploy.model_executor.graph_optimization.decorator import \
    support_graph_optimization
 from fastdeploy.model_executor.layers.activation import SiluAndMul
-from fastdeploy.model_executor.layers.attention import Attention
+from fastdeploy.model_executor.layers.attention.attention import Attention
 from fastdeploy.model_executor.layers.embeddings import VocabParallelEmbedding
 from fastdeploy.model_executor.layers.linear import (
    MergedColumnParallelLinear, QKVParallelLinear, RowParallelLinear)
@@ -55,7 +55,6 @@ class Qwen2MLP(nn.Layer):
            output_size=fd_config.model_config.ffn_hidden_size * 2,
            with_bias=False,
            activation=fd_config.model_config.hidden_act,
-            use_fast_ffn=True,
        )

        self.down_proj = RowParallelLinear(