[feat] support fa3 backend for pd disaggregated (#2695)

* support fa3 backend run in pd disaggregated

* delete use_fast_ffn
Yuanle Liu committed 2025-07-03 22:33:27 +08:00 (committed by GitHub)
parent 00863c43fd
commit 240bdac2a4
26 changed files with 455 additions and 139 deletions

fastdeploy/platforms/cuda.py

@@ -13,9 +13,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """
-"""
-cuda platform file
-"""
 import paddle
@@ -65,6 +62,11 @@ class CUDAPlatform(Platform):
             return (
                 "fastdeploy.model_executor.layers.attention.MLAAttentionBackend"
             )
+        elif selected_backend == _Backend.FLASH_ATTN:
+            logger.info("Using FLASH ATTN backend.")
+            return (
+                "fastdeploy.model_executor.layers.attention.FlashAttentionBackend"
+            )
         else:
             raise ValueError(
                 "Invalid attention backend you specified.\n"