[Iluvatar GPU] Optimize attention and MoE performance (#3234)

Author: yzwu
Date: 2025-08-08 10:51:24 +08:00
Committed by: GitHub
Parent: 37569cca86
Commit: fbdd6b0663
24 changed files with 1130 additions and 1653 deletions


@@ -39,8 +39,11 @@ def paged_attention(
     softcap: float = 0.0,
     use_cuda_graph: bool = False,
     use_sqrt_alibi: bool = False,
+    merged_qkv: bool = False,
     k: paddle.Tensor = None,
     v: paddle.Tensor = None,
+    rope_sin: paddle.Tensor = None,
+    rope_cos: paddle.Tensor = None,
 ):
     output = paged_attn(
         q,
@@ -51,6 +54,8 @@ def paged_attention(
         alibi_slopes,
         k,
         v,
+        rope_sin,
+        rope_cos,
         num_kv_heads,
         scale,
         block_size,
@@ -61,5 +66,6 @@ def paged_attention(
         softcap,
         use_cuda_graph,
         use_sqrt_alibi,
+        merged_qkv,
     )
     return output[0] if isinstance(output, list) else output
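
The new `rope_sin` / `rope_cos` tensors are forwarded straight to the backend `paged_attn` op, and `merged_qkv` flags that the query input carries a fused QKV projection, which lets the kernel apply rotary position embedding and the QKV split in one launch rather than in separate passes. As a point of reference, below is a minimal sketch of how a caller might precompute the rotary tables such a signature expects; the helper name `build_rope_tables`, the `base` value, and the `[max_seq_len, head_dim]` layout are assumptions for illustration, not taken from this diff.

import math
import paddle

def build_rope_tables(max_seq_len: int, head_dim: int, base: float = 10000.0):
    # Standard RoPE frequency schedule: one inverse frequency per pair of dims.
    inv_freq = paddle.exp(
        paddle.arange(0, head_dim, 2, dtype="float32") * (-math.log(base) / head_dim)
    )
    positions = paddle.arange(max_seq_len, dtype="float32")
    # Outer product via broadcasting: [max_seq_len, head_dim // 2].
    freqs = positions.unsqueeze(1) * inv_freq.unsqueeze(0)
    # Duplicate to cover the full head dimension: [max_seq_len, head_dim].
    angles = paddle.concat([freqs, freqs], axis=-1)
    return paddle.sin(angles), paddle.cos(angles)

# These tensors would be passed as the new rope_sin / rope_cos keyword
# arguments (with merged_qkv=True when q holds the fused QKV projection).
rope_sin, rope_cos = build_rope_tables(max_seq_len=4096, head_dim=128)

Whether the kernel expects an interleaved or half-rotated angle layout is backend-specific, so the concat layout above is only one plausible choice; the diff itself only fixes which arguments are threaded through to the Iluvatar op.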