support fa3 rope3d (#3622)

2025-10-30 11:26:39 +08:00 · 2025-08-27 11:31:29 +08:00
parent 85afa72763
commit ad319a87cc
4 changed files with 17 additions and 7 deletions
--- a/fastdeploy/model_executor/layers/attention/flash_attn_backend.py
+++ b/fastdeploy/model_executor/layers/attention/flash_attn_backend.py
@@ -311,6 +311,7 @@ class FlashAttentionBackend(AttentionBackend):
                metadata.kv_token_num_cpu[0].item(),
                self.max_seq_len,
                getattr(layer, "cache_quant_type_str", "none"),
+                self.rope_3d,
            )

            res_encoder = self.flash_attn_func(
--- a/fastdeploy/model_executor/layers/attention/ops/gqa_rope_write_cache.py
+++ b/fastdeploy/model_executor/layers/attention/ops/gqa_rope_write_cache.py
@@ -49,6 +49,7 @@ def gqa_rope_write_cache(
    kv_token_num: int = 1,
    max_seq_len: int = 0,
    cache_quant_type: str = "none",
+    rope_3d: bool = False,
 ):
    if current_platform.is_cuda():
        from fastdeploy.model_executor.ops.gpu import gqa_rope_write_cache
@@ -81,6 +82,7 @@ def gqa_rope_write_cache(
            kv_token_num,
            max_seq_len,
            cache_quant_type,
+            rope_3d,
        )
        return q, k, v, qkv_
    else: