From 3a15e0c53eae4e143d4be6af5712d6648515b834 Mon Sep 17 00:00:00 2001
From: yangjianfengo1 <125249383+yangjianfengo1@users.noreply.github.com>
Date: Wed, 6 Aug 2025 16:24:27 +0800
Subject: [PATCH] =?UTF-8?q?=E3=80=90Fix=20Bug=E3=80=91=20=E4=BF=AE?=
 =?UTF-8?q?=E5=A4=8D=20fa3=20=E6=94=AF=E6=8C=81=E9=9B=86=E4=B8=AD=E5=BC=8F?=
 =?UTF-8?q?bug=20(#3235)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* fix fa3 集中式bug

* 增加qknorm参数
---
 .../model_executor/layers/attention/flash_attn_backend.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/fastdeploy/model_executor/layers/attention/flash_attn_backend.py b/fastdeploy/model_executor/layers/attention/flash_attn_backend.py
index cfcf9ef92..ed9248393 100644
--- a/fastdeploy/model_executor/layers/attention/flash_attn_backend.py
+++ b/fastdeploy/model_executor/layers/attention/flash_attn_backend.py
@@ -344,7 +344,7 @@ class FlashAttentionBackend(AttentionBackend):
             forward_meta.decoder_batch_ids,  # from buffer
             forward_meta.decoder_tile_ids_per_batch,  # from buffer
             forward_meta.decoder_num_blocks_cpu,
-            forward_meta.max_len_tensor_cpu,
+            metadata.max_len_tensor_cpu_decoder,
             metadata.max_len_kv,
             metadata.rotary_embs,
             forward_meta.attn_mask,
@@ -359,6 +359,9 @@ class FlashAttentionBackend(AttentionBackend):
             layer.linear_shift,
             layer.linear_smooth,
             metadata.kv_signal_data_list[layer.layer_id],
+            getattr(layer, "q_norm_weight", None),
+            getattr(layer, "k_norm_weight", None),
+            getattr(layer, "rms_norm_eps", 1e-6),
             metadata._fuse_kernel_compute_dtype,
             getattr(layer, "cache_quant_type_str", "none"),
             layer.use_neox_rotary_style,