support qk norm (#3145)

Yuan Xiaolan
2025-08-05 16:46:14 +08:00
committed by GitHub
parent 4a10e29804
commit 7ce00e597c
17 changed files with 791 additions and 201 deletions

View File

@@ -73,6 +73,9 @@ std::vector<paddle::Tensor> AppendAttentionKernel(
const paddle::optional<paddle::Tensor>& out_linear_shifts,
const paddle::optional<paddle::Tensor>& out_linear_smooths,
const paddle::optional<paddle::Tensor>& kv_signal_data,
const paddle::optional<paddle::Tensor>& q_norm_weight,
const paddle::optional<paddle::Tensor>& k_norm_weight,
const float rms_norm_eps,
const std::string& cache_quant_type_str,
const bool use_neox_rotary_style,
const bool rope_3d,
@@ -223,7 +226,10 @@ std::vector<paddle::Tensor> AppendAttentionKernel(
main_stream,
&qkv_out,
const_cast<paddle::Tensor*>(&key_cache),
const_cast<paddle::Tensor*>(&value_cache));
const_cast<paddle::Tensor*>(&value_cache),
q_norm_weight,
k_norm_weight,
rms_norm_eps);
};
if (qkv_out_scales) {
@@ -339,7 +345,10 @@ std::vector<paddle::Tensor> AppendAttentionKernel(
exec_stream,
&qkv_out,
const_cast<paddle::Tensor*>(&key_cache),
const_cast<paddle::Tensor*>(&value_cache));
const_cast<paddle::Tensor*>(&value_cache),
q_norm_weight,
k_norm_weight,
rms_norm_eps);
} else {
DecoderWriteCacheWithRoPEKernel<data_t, data_t>(
meta_data,
@@ -363,7 +372,10 @@ std::vector<paddle::Tensor> AppendAttentionKernel(
exec_stream,
&qkv_out,
const_cast<paddle::Tensor*>(&key_cache),
const_cast<paddle::Tensor*>(&value_cache));
const_cast<paddle::Tensor*>(&value_cache),
q_norm_weight,
k_norm_weight,
rms_norm_eps);
}
}
@@ -430,6 +442,9 @@ std::vector<paddle::Tensor> AppendAttention(
const paddle::optional<paddle::Tensor>& out_linear_shifts,
const paddle::optional<paddle::Tensor>& out_linear_smooths,
const paddle::optional<paddle::Tensor>& kv_signal_data,
const paddle::optional<paddle::Tensor>& q_norm_weight,
const paddle::optional<paddle::Tensor>& k_norm_weight,
const float rms_norm_eps,
const std::string& compute_dtype,
const std::string& cache_quant_type_str,
const bool use_neox_rotary_style,
@@ -500,6 +515,9 @@ std::vector<paddle::Tensor> AppendAttention(
out_linear_shifts,
out_linear_smooths,
kv_signal_data,
q_norm_weight,
k_norm_weight,
rms_norm_eps,
cache_quant_type_str,
use_neox_rotary_style,
rope_3d,
@@ -577,6 +595,9 @@ std::vector<std::vector<int64_t>> AppendAttentionInferShape(
const paddle::optional<std::vector<int64_t>>& out_linear_shifts_shape,
const paddle::optional<std::vector<int64_t>>& out_linear_smooths_shape,
const paddle::optional<std::vector<int64_t>>& kv_signal_data_shape,
const paddle::optional<std::vector<int64_t>>& q_norm_weight_shape,
const paddle::optional<std::vector<int64_t>>& k_norm_weight_shape,
const float rms_norm_eps,
const std::string& compute_dtype,
const std::string& cache_quant_type_str,
const bool use_neox_rotary_style,
@@ -637,6 +658,9 @@ std::vector<paddle::DataType> AppendAttentionInferDtype(
const paddle::optional<paddle::DataType>& out_linear_shifts_dtype,
const paddle::optional<paddle::DataType>& out_linear_smooths_dtype,
const paddle::optional<paddle::DataType>& kv_signal_data_dtype,
const paddle::optional<paddle::DataType>& q_norm_weight_dtype,
const paddle::optional<paddle::DataType>& k_norm_weight_dtype,
const float rms_norm_eps,
const std::string& compute_dtype,
const std::string& cache_quant_type_str,
const bool use_neox_rotary_style,
@@ -714,7 +738,9 @@ PD_BUILD_STATIC_OP(append_attention)
paddle::Optional("cache_v_zp"),
paddle::Optional("out_linear_shifts"),
paddle::Optional("out_linear_smooths"),
paddle::Optional("kv_signal_data")})
paddle::Optional("kv_signal_data"),
paddle::Optional("q_norm_weight"),
paddle::Optional("k_norm_weight")})
.Outputs({"fmha_out", "qkv_out", "key_cache_out", "value_cache_out"})
.SetInplaceMap({{"key_cache", "key_cache_out"},
{"value_cache", "value_cache_out"}})
@@ -732,7 +758,8 @@ PD_BUILD_STATIC_OP(append_attention)
"encoder_max_partition_size: int",
"speculate_max_draft_token_num: int",
"causal: bool",
"speculate_decoder: bool"})
"speculate_decoder: bool",
"rms_norm_eps: float"})
.SetKernelFn(PD_KERNEL(AppendAttention))
.SetInferShapeFn(PD_INFER_SHAPE(AppendAttentionInferShape))
.SetInferDtypeFn(PD_INFER_DTYPE(AppendAttentionInferDtype));

View File

@@ -18,6 +18,142 @@
#include "mma_tensor_op.cuh"
#include "utils.cuh"
template <typename T, int VecSize = 1>
__global__ void append_decode_cache_T_rope_qk_norm_kernel(
const T* __restrict__ quant_qkv, // [bsz, num_heads + 2 * kv_num_heads,
// head_size]
T* __restrict__ key_cache, // [num_blocks, kv_num_heads, block_size,
// head_size // 2]
T* __restrict__ value_cache, // [num_blocks, kv_num_heads, block_size,
// head_size // 2]
T* __restrict__ qkv_out,
const int* __restrict__ block_tables, // [bsz, max_blocks_per_seq]
const int* __restrict__ batch_id_per_token, // [num_tokens]
const int* __restrict__ cu_seqlens_q,
const int* __restrict__ seq_lens, // [bsz]
const int* __restrict__ seq_lens_encoder, // [bsz]
const float* __restrict__ cos_emb,
const float* __restrict__ sin_emb,
const int max_seq_len,
const int max_blocks_per_seq,
const int num_heads,
const int head_size,
const int block_size,
const uint32_t elem_cnt,
const int kv_num_heads,
const bool rope_3d,
const T* q_norm_weight,
const T* k_norm_weight,
const float rms_norm_eps) {
using LoadT = AlignedVector<T, VecSize>;
using LoadBiasT = AlignedVector<T, VecSize>;
using LoadKVT = AlignedVector<T, VecSize>;
constexpr int HalfVecSize = VecSize / 2;
using LoadEmbT = AlignedVector<float, HalfVecSize>;
LoadT src_vec;
LoadBiasT out_vec;
LoadKVT cache_vec;
LoadEmbT cos_emb_vec;
LoadEmbT sin_emb_vec;
int64_t global_warp_idx = blockIdx.x * blockDim.x + threadIdx.x;
int64_t all_warp_num = gridDim.x * blockDim.x;
int64_t all_head_dim = elem_cnt / head_size;
const int64_t hidden_size = (num_heads + 2 * kv_num_heads) * head_size;
// const int64_t offset = 2 * hidden_size;
const int half_head_size = head_size / 2;
for (int global_hi = global_warp_idx; global_hi < all_head_dim; global_hi += all_warp_num) {
int64_t linear_index = global_hi * head_size + threadIdx.y * VecSize;
const int ori_bi = linear_index / hidden_size;
const int bias = linear_index % hidden_size;
const int hi = bias / head_size; // q + k + v
const int h_bias = bias % head_size;
const int start_token_idx = cu_seqlens_q[ori_bi];
if (seq_lens_encoder[ori_bi] > 0) return;
const int write_seq_id = seq_lens[ori_bi];
if (write_seq_id == 0) continue;
const int* block_table_now = nullptr;
block_table_now = block_tables + ori_bi * max_blocks_per_seq;
const int block_idx = block_table_now[write_seq_id / block_size];
const int block_offset = write_seq_id % block_size;
const uint32_t ori_idx =
start_token_idx * hidden_size + hi * head_size + h_bias;
const int bias_idx = hi * head_size + h_bias;
Load<T, VecSize>(&quant_qkv[ori_idx], &src_vec);
if (hi < num_heads + kv_num_heads) {
// q k rope
const uint32_t emb_idx = write_seq_id * half_head_size + h_bias / 2;
uint32_t new_emb_idx = rope_3d ? emb_idx + ori_bi * max_seq_len * head_size : emb_idx;
Load<float, HalfVecSize>(&cos_emb[new_emb_idx], &cos_emb_vec);
Load<float, HalfVecSize>(&sin_emb[new_emb_idx], &sin_emb_vec);
}
float thread_m2 = 0.0f;
float warp_m2 = 0.0f;
#pragma unroll
for (int i = 0; i < HalfVecSize; i++) {
// dequant + add_bias + rope
float input_left = static_cast<float>(src_vec[2 * i]);
float input_right = static_cast<float>(src_vec[2 * i + 1]);
if (hi < num_heads + kv_num_heads) {
const float cos_tmp = cos_emb_vec[i];
const float sin_tmp = sin_emb_vec[i];
float tmp1 = input_left * cos_tmp - input_right * sin_tmp;
float tmp2 = input_right * cos_tmp + input_left * sin_tmp;
thread_m2 += tmp1 * tmp1 + tmp2 * tmp2;
out_vec[2 * i] =
static_cast<T>(tmp1);
out_vec[2 * i + 1] =
static_cast<T>(tmp2);
} else {
out_vec[2 * i] = src_vec[2 * i];
out_vec[2 * i + 1] = src_vec[2 * i + 1];
}
}
if (hi < (num_heads + kv_num_heads)) { // q k
WelfordWarpAllReduce<float, 32>(thread_m2, &warp_m2);
float row_variance =
max(warp_m2 / head_size, 0.0f);
float row_inv_var = Rsqrt(row_variance + rms_norm_eps);
LoadT q_norm_vec, k_norm_vec;
if (hi < num_heads) { // q
Load<T, VecSize>(&q_norm_weight[threadIdx.y * VecSize], &q_norm_vec);
#pragma unroll
for (int i = 0; i < VecSize; i++) {
out_vec[i] = static_cast<T>(static_cast<float>(out_vec[i]) * row_inv_var * static_cast<float>(q_norm_vec[i]));
}
} else { // k
Load<T, VecSize>(&k_norm_weight[threadIdx.y * VecSize], &k_norm_vec);
for (int i = 0; i < VecSize; i++) {
out_vec[i] = static_cast<T>(static_cast<float>(out_vec[i]) * row_inv_var * static_cast<float>(k_norm_vec[i]));
}
}
}
if (hi < num_heads) {
// write q
Store<T, VecSize>(out_vec, &qkv_out[ori_idx]);
} else {
// quant + write k/v
const uint32_t kv_head_idx = (hi - num_heads) % kv_num_heads;
const uint32_t tgt_idx =
block_idx * kv_num_heads * block_size * head_size +
kv_head_idx * block_size * head_size + block_offset * head_size +
h_bias;
if (hi < num_heads + kv_num_heads) {
Store<T, VecSize>(out_vec, &key_cache[tgt_idx]);
} else {
Store<T, VecSize>(out_vec, &value_cache[tgt_idx]);
}
}
}
}
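For reference, the per-head math this kernel fuses is: rotate each interleaved (even, odd) pair of a q/k head vector with the cached cos/sin values, accumulate the sum of squares of the rotated values, then apply an RMSNorm over the head dimension with the learned per-channel weight; v heads skip both steps and are written to the cache unchanged. Below is a minimal NumPy sketch of that computation, with illustrative names and shapes rather than the kernel's actual API:

import numpy as np

def rope_then_rms_norm(x, cos, sin, norm_weight, eps=1e-6):
    # x: one q or k head vector, shape [head_dim]
    # cos, sin: rotary values for this token position, shape [head_dim // 2]
    # norm_weight: learned q_norm_weight or k_norm_weight, shape [head_dim]
    x = x.astype(np.float32)
    left, right = x[0::2], x[1::2]          # interleaved (even, odd) pairs, as in the kernel
    rot = np.empty_like(x)
    rot[0::2] = left * cos - right * sin
    rot[1::2] = right * cos + left * sin
    # variance = sum(rot^2) / head_dim, matching warp_m2 / head_size above
    inv_rms = 1.0 / np.sqrt(np.mean(rot * rot) + eps)
    return rot * inv_rms * norm_weight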
template <typename T, int VecSize = 1>
__global__ void append_decode_cache_T_rope_kernel(
const T* __restrict__ quant_qkv, // [bsz, num_heads + 2 * kv_num_heads,

View File

@@ -15,6 +15,70 @@
#include "decoder_write_cache_with_rope_kernel.h"
#include "utils.cuh"
template <typename T, typename QKV_TYPE>
void append_decode_cache_rope_qk_norm(const QKV_TYPE* qkv,
T* key_cache,
T* value_cache,
T* qkv_out,
const int* block_tables,
const int* batch_id_per_token,
const int* cu_seqlens_q,
const int* seq_lens,
const int* seq_lens_encoder,
const float* cos_emb,
const float* sin_emb,
const float* qkv_out_scales,
const T* qkv_biases,
const int max_seq_len,
const int max_blocks_per_seq,
const int num_heads,
const int kv_num_heads,
const int dim_head,
const int block_size,
const int bsz,
const cudaStream_t& stream,
const bool use_neox_style,
const bool rope_3d,
const T* q_norm_weight,
const T* k_norm_weight,
const float rms_norm_eps) {
const uint32_t elem_nums =
use_neox_style ? bsz * (num_heads + 2 * kv_num_heads) * dim_head / 2
: bsz * (num_heads + 2 * kv_num_heads) * dim_head;
assert(dim_head == 128 && "dim_head must be 128");
constexpr int HEAD_DIM = 128;
constexpr int PackSize = HEAD_DIM / kWarpSize;
const int pack_num = elem_nums / PackSize;
const int blocksize = 128;
int grid_size = 1;
GetNumBlocks<128>(pack_num, &grid_size);
dim3 block_dim(blocksize / kWarpSize, kWarpSize, 1);
append_decode_cache_T_rope_qk_norm_kernel<T, PackSize>
<<<grid_size, block_dim, 0, stream>>>(reinterpret_cast<const T*>(qkv),
key_cache,
value_cache,
qkv_out,
block_tables,
batch_id_per_token,
cu_seqlens_q,
seq_lens,
seq_lens_encoder,
cos_emb,
sin_emb,
max_seq_len,
max_blocks_per_seq,
num_heads,
dim_head,
block_size,
elem_nums,
kv_num_heads,
rope_3d,
q_norm_weight,
k_norm_weight,
rms_norm_eps);
}
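The launcher pins head_dim to 128 (see the assert above), so PackSize = HEAD_DIM / kWarpSize = 4: each thread loads one 4-element vector, the 32 threads along blockDim.y together span one full head, and the 32-wide shuffle reduction is meant to combine their partial sums of squares into the per-head total. A small arithmetic check of those assumptions (values mirror the constants in this launcher; the block shape itself is illustrative):

# Launch-shape arithmetic assumed by the qk-norm decode kernel (illustrative check).
HEAD_DIM = 128                                     # enforced by the assert in the launcher
WARP_SIZE = 32                                     # kWarpSize
PACK_SIZE = HEAD_DIM // WARP_SIZE                  # 4, the kernel's VecSize
BLOCK_SIZE = 128                                   # threads per block
block_dim = (BLOCK_SIZE // WARP_SIZE, WARP_SIZE)   # (4, 32), as in dim3 block_dim(...)
assert PACK_SIZE * WARP_SIZE == HEAD_DIM           # 32 threads x 4 elements cover one head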
template <typename T, typename QKV_TYPE>
void append_decode_cache_rope(const QKV_TYPE* qkv,
T* key_cache,
@@ -441,7 +505,10 @@ void DecoderWriteCacheWithRoPEKernel(
cudaStream_t& stream,
paddle::Tensor* qkv_out,
paddle::Tensor* key_cache_out,
paddle::Tensor* value_cache_out) {
paddle::Tensor* value_cache_out,
const paddle::optional<paddle::Tensor>& q_norm_weight,
const paddle::optional<paddle::Tensor>& k_norm_weight,
const float rms_norm_eps) {
typedef cascade_attn_type_traits<T> traits_;
typedef cascade_attn_type_traits<QKV_TYPE> qkt_nv_type_;
typedef typename traits_::type DataType_;
@@ -464,6 +531,43 @@ void DecoderWriteCacheWithRoPEKernel(
? rotary_embs.get().data<float>() + max_seq_len * dim_head
: rotary_embs.get().data<float>() + max_seq_len * dim_head / 2;
}
if (q_norm_weight && k_norm_weight) {
if (cache_quant_type_str == "none") {
append_decode_cache_rope_qk_norm(
reinterpret_cast<const QKV_TYPE*>(qkv_ptr),
reinterpret_cast<DataType_*>(key_cache_out->data<T>()),
reinterpret_cast<DataType_*>(value_cache_out->data<T>()),
reinterpret_cast<DataType_*>(qkv_out->data<T>()),
block_tables.data<int>(),
batch_id_per_token.data<int>(),
cu_seqlens_q.data<int>(),
seq_lens.data<int>(),
seq_lens_encoder.data<int>(),
cos_emb,
sin_emb,
qkv_out_scales ? qkv_out_scales.get().data<float>() : nullptr,
qkv_biases ? reinterpret_cast<DataType_*>(
const_cast<T*>(qkv_biases.get().data<T>()))
: nullptr,
max_seq_len,
max_blocks_per_seq,
num_heads,
kv_num_heads,
dim_head,
block_size,
bsz,
stream,
use_neox_rotary_style,
rope_3d,
reinterpret_cast<const DataType_*>(q_norm_weight.get().data<T>()),
reinterpret_cast<const DataType_*>(k_norm_weight.get().data<T>()),
rms_norm_eps);
} else {
PD_THROW(
"append_decode_cache_rope_qk_norm does not support cache kv quant yet");
}
} else {
if (cache_quant_type_str == "none") {
append_decode_cache_rope(
reinterpret_cast<const QKV_TYPE*>(qkv_ptr),
@@ -641,6 +745,7 @@ void DecoderWriteCacheWithRoPEKernel(
"cache_int4_zp]");
}
}
}
template void DecoderWriteCacheWithRoPEKernel<paddle::bfloat16, int>(
@@ -667,7 +772,10 @@ template void DecoderWriteCacheWithRoPEKernel<paddle::bfloat16, int>(
cudaStream_t& stream,
paddle::Tensor* qkv_out,
paddle::Tensor* key_cache_out,
paddle::Tensor* value_cache_out);
paddle::Tensor* value_cache_out,
const paddle::optional<paddle::Tensor>& q_norm_weight,
const paddle::optional<paddle::Tensor>& k_norm_weight,
const float rms_norm_eps);
template void
DecoderWriteCacheWithRoPEKernel<paddle::bfloat16, paddle::bfloat16>(
@@ -694,7 +802,10 @@ DecoderWriteCacheWithRoPEKernel<paddle::bfloat16, paddle::bfloat16>(
cudaStream_t& stream,
paddle::Tensor* qkv_out,
paddle::Tensor* key_cache_out,
paddle::Tensor* value_cache_out);
paddle::Tensor* value_cache_out,
const paddle::optional<paddle::Tensor>& q_norm_weight,
const paddle::optional<paddle::Tensor>& k_norm_weight,
const float rms_norm_eps);
template void DecoderWriteCacheWithRoPEKernel<paddle::float16, int>(
const AppendAttnMetaData& meta_data,
@@ -720,7 +831,10 @@ template void DecoderWriteCacheWithRoPEKernel<paddle::float16, int>(
cudaStream_t& stream,
paddle::Tensor* qkv_out,
paddle::Tensor* key_cache_out,
paddle::Tensor* value_cache_out);
paddle::Tensor* value_cache_out,
const paddle::optional<paddle::Tensor>& q_norm_weight,
const paddle::optional<paddle::Tensor>& k_norm_weight,
const float rms_norm_eps);
template void DecoderWriteCacheWithRoPEKernel<paddle::float16, paddle::float16>(
const AppendAttnMetaData& meta_data,
@@ -746,4 +860,7 @@ template void DecoderWriteCacheWithRoPEKernel<paddle::float16, paddle::float16>(
cudaStream_t& stream,
paddle::Tensor* qkv_out,
paddle::Tensor* key_cache_out,
paddle::Tensor* value_cache_out);
paddle::Tensor* value_cache_out,
const paddle::optional<paddle::Tensor>& q_norm_weight,
const paddle::optional<paddle::Tensor>& k_norm_weight,
const float rms_norm_eps);
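To summarize the dispatch added in this file: the qk-norm path runs only when both norm weights are provided, and it currently requires an unquantized KV cache; otherwise the pre-existing kernels are used unchanged. A hedged Python sketch of that selection logic (the function name is illustrative, not part of the codebase):

def selects_qk_norm_path(q_norm_weight, k_norm_weight, cache_quant_type):
    # Mirrors the branch in DecoderWriteCacheWithRoPEKernel above.
    if q_norm_weight is not None and k_norm_weight is not None:
        if cache_quant_type != "none":
            raise ValueError("qk norm does not support cache kv quantization yet")
        return True
    return False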

View File

@@ -40,4 +40,6 @@ void DecoderWriteCacheWithRoPEKernel(
cudaStream_t& stream,
paddle::Tensor* qkv_out,
paddle::Tensor* key_cache_out,
paddle::Tensor* value_cache_out);
paddle::Tensor* value_cache_out,
const paddle::optional<paddle::Tensor>& q_norm_weight,
const paddle::optional<paddle::Tensor>& k_norm_weight,
const float rms_norm_eps);

View File

@@ -358,7 +358,7 @@ __global__ void GQAVariableLengthRotaryKernel(
linear_index < elem_cnt;
linear_index += step) {
const int token_idx = linear_index / offset;
const int ori_bi = batch_id_per_token[token_idx];;
const int ori_bi = batch_id_per_token[token_idx];
if (seq_lens[ori_bi] == 0) continue;
const int bias = linear_index % offset;
const int hi = bias / last_dim;
@@ -405,6 +405,94 @@ __global__ void GQAVariableLengthRotaryKernel(
}
}
template <typename T, int VecSize = 1>
__global__ void GQAVariableLengthRotaryQKNormKernel(
const T *qkv,
const float *cos_emb,
const float *sin_emb,
const int *batch_id_per_token,
const int *cu_seqlens_q,
const int *seq_lens,
const int *seq_lens_decoder,
T *qkv_out,
const int64_t elem_cnt,
const int q_num_head,
const int kv_num_head,
const int seq_len,
const int last_dim,
const bool rope_3d,
const T* q_norm_weight,
const T* k_norm_weight,
const float rms_norm_eps
) {
using LoadT = AlignedVector<T, VecSize>;
constexpr int HalfVecSize = VecSize / 2;
using LoadEmbT = AlignedVector<float, HalfVecSize>;
LoadT src_vec;
LoadEmbT cos_emb_vec;
LoadEmbT sin_emb_vec;
int64_t global_warp_idx = blockDim.x * blockIdx.x + threadIdx.x;
int64_t all_warp_num = gridDim.x * blockDim.x;
const int half_lastdim = last_dim / 2;
const int offset = (q_num_head + kv_num_head) * last_dim;
const int all_head_num = elem_cnt / last_dim;
for (int global_hi = global_warp_idx; global_hi < all_head_num; global_hi += all_warp_num) {
int64_t linear_index = global_hi * last_dim + threadIdx.y * VecSize;
const int token_idx = linear_index / offset;
const int ori_bi = batch_id_per_token[token_idx];
if (seq_lens[ori_bi] == 0) continue;
const int bias = linear_index % offset;
const int hi = bias / last_dim;
const int h_bias = bias % last_dim;
const int ori_seq_id = (token_idx - cu_seqlens_q[ori_bi]) + seq_lens_decoder[ori_bi];
const int64_t emb_idx = ori_seq_id * half_lastdim + h_bias / 2;
const int64_t base_idx =
token_idx * (q_num_head + 2 * kv_num_head) * last_dim + hi * last_dim +
h_bias;
Load<T, VecSize>(&qkv[base_idx], &src_vec);
int64_t new_emb_idx = rope_3d ? emb_idx + ori_bi * last_dim * seq_len : emb_idx;
Load<float, HalfVecSize>(&cos_emb[new_emb_idx], &cos_emb_vec);
Load<float, HalfVecSize>(&sin_emb[new_emb_idx], &sin_emb_vec);
float thread_m2 = 0.0f;
float warp_m2 = 0.0f;
#pragma unroll
for (int i = 0; i < HalfVecSize; i++) {
const float input_left = static_cast<float>(src_vec[2 * i]);
const float input_right = static_cast<float>(src_vec[2 * i + 1]);
const float cos_tmp = cos_emb_vec[i];
const float sin_tmp = sin_emb_vec[i];
float tmp1 = input_left * cos_tmp - input_right * sin_tmp;
float tmp2 = input_right * cos_tmp + input_left * sin_tmp;
src_vec[2 * i] = static_cast<T>(tmp1);
src_vec[2 * i + 1] = static_cast<T>(tmp2);
thread_m2 += tmp1 * tmp1 + tmp2 * tmp2;
}
WelfordWarpAllReduce<float, 32>(thread_m2, &warp_m2);
float row_variance =
max(warp_m2 / last_dim, 0.0f);
float row_inv_var = Rsqrt(row_variance + rms_norm_eps);
LoadT q_norm_vec, k_norm_vec;
if (hi < q_num_head) {
Load<T, VecSize>(&q_norm_weight[threadIdx.y * VecSize], &q_norm_vec);
#pragma unroll
for (int i = 0; i < VecSize; i++) {
src_vec[i] = static_cast<T>(static_cast<float>(src_vec[i]) * row_inv_var * static_cast<float>(q_norm_vec[i]));
}
} else {
Load<T, VecSize>(&k_norm_weight[threadIdx.y * VecSize], &k_norm_vec);
for (int i = 0; i < VecSize; i++) {
src_vec[i] = static_cast<T>(static_cast<float>(src_vec[i]) * row_inv_var * static_cast<float>(k_norm_vec[i]));
}
}
Store<T, VecSize>(src_vec, &qkv_out[base_idx]);
}
}
template <typename T, int VecSize = 1>
__global__ void GQAVariableLengthRotaryKernel(
const T *qkv,
@@ -1568,6 +1656,66 @@ void rotary_qk_variable(
}
}
template <typename T, typename QKV_TYPE>
void gqa_rotary_qk_norm_variable(
T *qkv_out, // [token_num, 3, num_head, dim_head]
const QKV_TYPE *qkv_input, // qkv
const float *qkv_out_scales, // [3, num_head, dim_head]
const T *qkv_bias,
const float *rotary_emb, // [2, 1, 1, seq_len, dim_head / 2]
const int *batch_id_per_token,
const int *cu_seqlens_q,
const int *seq_lens,
const int *seq_lens_decoder,
const int token_num,
const int num_heads,
const int kv_num_heads,
const int seq_len,
const int input_output_len,
const int dim_head,
const cudaStream_t &stream,
bool use_neox_style = false,
bool rope_3d = false,
const T *q_norm_weight = nullptr,
const T *k_norm_weight = nullptr,
const float rms_norm_eps = 1e-6) {
int64_t elem_nums =
qkv_out_scales
? token_num * (num_heads + 2 * kv_num_heads) * dim_head
: token_num * (num_heads + kv_num_heads) * dim_head; // for all q k v
assert(dim_head == 128 && "dim_head must be 128");
constexpr int HEAD_DIM = 128;
constexpr int PackSize = HEAD_DIM / kWarpSize;
const int pack_num = elem_nums / PackSize;
const int blocksize = 128;
int grid_size = 1;
GetNumBlocks<128>(pack_num, &grid_size);
dim3 Blocks(blocksize / kWarpSize, kWarpSize, 1);
const float *cos_emb = rotary_emb;
const float *sin_emb = rotary_emb + input_output_len * dim_head / 2;
GQAVariableLengthRotaryQKNormKernel<T, PackSize>
<<<grid_size, Blocks, 0, stream>>>(
reinterpret_cast<const T *>(qkv_input),
cos_emb,
sin_emb,
batch_id_per_token,
cu_seqlens_q,
seq_lens,
seq_lens_decoder,
qkv_out,
elem_nums,
num_heads,
kv_num_heads,
seq_len,
dim_head,
rope_3d,
q_norm_weight,
k_norm_weight,
rms_norm_eps);
}
template <typename T, typename QKV_TYPE>
void gqa_rotary_qk_variable(
T *qkv_out, // [token_num, 3, num_head, dim_head]

View File

@@ -46,7 +46,10 @@ void EncoderWriteCacheWithRopeKernel(
cudaStream_t& stream,
paddle::Tensor* qkv_out,
paddle::Tensor* key_cache_out,
paddle::Tensor* value_cache_out) {
paddle::Tensor* value_cache_out,
const paddle::optional<paddle::Tensor>& q_norm_weight,
const paddle::optional<paddle::Tensor>& k_norm_weight,
const float rms_norm_eps) {
auto token_num = meta_data.token_nums;
auto num_heads = meta_data.q_num_heads;
auto kv_num_heads = meta_data.kv_num_heads;
@@ -56,6 +59,35 @@ void EncoderWriteCacheWithRopeKernel(
is_scale_channel_wise = true;
}
if (q_norm_weight && k_norm_weight) {
if (num_heads != kv_num_heads && !is_scale_channel_wise && !use_neox_style) {
gqa_rotary_qk_norm_variable(
qkv_out->data<T>(),
qkv.data<QKV_TYPE>(),
qkv_out_scales ? qkv_out_scales.get().data<float>() : nullptr,
qkv_biases ? qkv_biases.get().data<T>() : nullptr,
rotary_embs.get().data<float>(),
batch_id_per_token.data<int>(),
cu_seqlens_q.data<int>(),
seq_lens_encoder.data<int>(),
seq_lens_decoder.data<int>(),
token_num,
num_heads,
kv_num_heads,
max_seq_len,
rope_3d ? rotary_embs.get().dims()[3] : rotary_embs.get().dims()[2],
head_dim,
stream,
use_neox_style,
rope_3d,
q_norm_weight ? q_norm_weight.get().data<T>() : nullptr,
k_norm_weight ? k_norm_weight.get().data<T>() : nullptr,
rms_norm_eps);
} else {
PD_THROW(
"gqa_rotary_qk_norm_variable only supports gqa mode; channel-wise scale and neox style are not supported");
}
} else {
if (num_heads == kv_num_heads) {
rotary_qk_variable(
qkv_out->data<T>(),
@@ -121,6 +153,7 @@ void EncoderWriteCacheWithRopeKernel(
}
}
}
const uint32_t block_size = meta_data.block_size;
if (cache_quant_type_str == "none") {
CascadeAppendWriteCacheKVQKV<T>(meta_data,

View File

@@ -43,4 +43,7 @@ EncoderWriteCacheWithRopeKernel<paddle::bfloat16, paddle::bfloat16>(
cudaStream_t& stream,
paddle::Tensor* qkv_out,
paddle::Tensor* key_cache_out,
paddle::Tensor* value_cache_out);
paddle::Tensor* value_cache_out,
const paddle::optional<paddle::Tensor>& q_norm_weight,
const paddle::optional<paddle::Tensor>& k_norm_weight,
const float rms_norm_eps);

View File

@@ -42,4 +42,7 @@ template void EncoderWriteCacheWithRopeKernel<paddle::bfloat16, int>(
cudaStream_t& stream,
paddle::Tensor* qkv_out,
paddle::Tensor* key_cache_out,
paddle::Tensor* value_cache_out);
paddle::Tensor* value_cache_out,
const paddle::optional<paddle::Tensor>& q_norm_weight,
const paddle::optional<paddle::Tensor>& k_norm_weight,
const float rms_norm_eps);

View File

@@ -42,4 +42,7 @@ template void EncoderWriteCacheWithRopeKernel<paddle::float16, paddle::float16>(
cudaStream_t& stream,
paddle::Tensor* qkv_out,
paddle::Tensor* key_cache_out,
paddle::Tensor* value_cache_out);
paddle::Tensor* value_cache_out,
const paddle::optional<paddle::Tensor>& q_norm_weight,
const paddle::optional<paddle::Tensor>& k_norm_weight,
const float rms_norm_eps);

View File

@@ -42,4 +42,7 @@ template void EncoderWriteCacheWithRopeKernel<paddle::float16, int>(
cudaStream_t& stream,
paddle::Tensor* qkv_out,
paddle::Tensor* key_cache_out,
paddle::Tensor* value_cache_out);
paddle::Tensor* value_cache_out,
const paddle::optional<paddle::Tensor>& q_norm_weight,
const paddle::optional<paddle::Tensor>& k_norm_weight,
const float rms_norm_eps);

View File

@@ -559,3 +559,37 @@ template <typename T, bool IsFP8>inline __device__ static void convert_c8(T * re
convert_int8(result, source);
}
}
constexpr int kWarpSize = 32;
template<typename T>
inline __device__ void WelfordCombine1(T b_m2, T* m2) {
*m2 += b_m2;
}
template<typename T, int thread_group_width = kWarpSize>
__inline__ __device__ void WelfordWarpReduce(T thread_m2, T* m2) {
*m2 = thread_m2;
for (int mask = thread_group_width / 2; mask > 0; mask >>= 1) {
T b_m2 = __shfl_xor_sync(0xffffffff, *m2, mask);
WelfordCombine1(b_m2, m2);
}
}
template<typename T, int thread_group_width = kWarpSize>
__inline__ __device__ void WelfordWarpAllReduce(T thread_m2, T* m2) {
WelfordWarpReduce<T, thread_group_width>(thread_m2, m2);
}
template <typename T>
__inline__ __device__ T Rsqrt(T x);
template <>
__inline__ __device__ float Rsqrt<float>(float x) {
return rsqrt(x);
}
template <>
__inline__ __device__ double Rsqrt<double>(double x) {
return rsqrt(x);
}
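Note that, despite the Welford naming, these helpers only propagate a running sum of squares (m2): WelfordCombine1 adds two partial sums and WelfordWarpReduce combines them across a 32-lane group with XOR shuffles, after which the calling kernels divide by the head dimension and apply Rsqrt. A small NumPy sketch of the same reduction semantics, assuming a 32-lane group (names are illustrative):

import numpy as np

def warp_allreduce_sum(lane_vals, group_width=32):
    # Butterfly all-reduce: each step pairs lane i with lane i ^ mask,
    # mirroring __shfl_xor_sync in WelfordWarpReduce.
    vals = np.asarray(lane_vals, dtype=np.float32).copy()
    mask = group_width // 2
    while mask > 0:
        vals = vals + vals[np.arange(group_width) ^ mask]
        mask //= 2
    return vals  # every lane now holds the group-wide sum

def inv_rms(sum_sq, head_dim, eps):
    variance = max(sum_sq / head_dim, 0.0)
    return 1.0 / np.sqrt(variance + eps)  # matches Rsqrt(row_variance + rms_norm_eps)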

View File

@@ -78,6 +78,9 @@ std::vector<paddle::Tensor> AppendAttention(
const paddle::optional<paddle::Tensor> &out_linear_shifts,
const paddle::optional<paddle::Tensor> &out_linear_smooths,
const paddle::optional<paddle::Tensor> &kv_signal_data,
const paddle::optional<paddle::Tensor> &q_norm_weight,
const paddle::optional<paddle::Tensor> &k_norm_weight,
const float rms_norm_eps,
const std::string &compute_dtype, const std::string &cache_quant_type_str,
const bool use_neox_rotary_style, const bool rope_3d,
const int max_input_length, const float quant_max_bound,

View File

@@ -43,6 +43,11 @@
__VA_ARGS__ \
break; \
} \
case 48: { \
constexpr size_t NUM_EXPERTS_PER_RANK = 48; \
__VA_ARGS__ \
break; \
} \
case 64: { \
constexpr size_t NUM_EXPERTS_PER_RANK = 64; \
__VA_ARGS__ \

View File

@@ -262,6 +262,9 @@ class AppendAttentionBackend(AttentionBackend):
layer.linear_shift,
layer.linear_smooth,
metadata.kv_signal_data_list[layer.layer_id],
getattr(layer, "q_norm_weight", None),
getattr(layer, "k_norm_weight", None),
getattr(layer, "rms_norm_eps", 1e-6),
metadata._fuse_kernel_compute_dtype,
getattr(layer, "cache_quant_type_str", "none"),
layer.use_neox_rotary_style,

View File

@@ -28,6 +28,7 @@ from fastdeploy.model_executor.layers.quantization.quant_base import QuantMethod
if TYPE_CHECKING:
from fastdeploy.model_executor.forward_meta import ForwardMeta
from fastdeploy.model_executor.layers.utils import get_tensor
class Attention(nn.Layer):
@@ -49,6 +50,7 @@ class Attention(nn.Layer):
linear_smooth: paddle.Tensor = None,
use_neox_rotary_style: bool = False,
use_qk_norm: bool = False,
rms_norm_eps: float = 1e-6,
) -> None:
"""
Initializes `LMLayer` with the given parameters.
@@ -63,6 +65,8 @@ class Attention(nn.Layer):
prefix (str, optional): The name of current layer. Defaults to "".
linear_shift (Optional[paddle.Tensor], optional): The shift of linear. Defaults to None.
linear_smooth (Optional[paddle.Tensor], optional): The smooth of linear. Defaults to None.
use_qk_norm (bool, optional): Whether to apply RMSNorm on Q and K after RoPE. Defaults to False.
rms_norm_eps (float, optional): The epsilon of RMSNorm. Defaults to 1e-6.
Raises:
ValueError: If the `v_head_dim` is less than 0.
@@ -102,6 +106,27 @@ class Attention(nn.Layer):
logger.info(
f"Attention is running in cache kv {self.kvcache_quant_method.cache_quant_config.quant_type} mode"
)
self.use_qk_norm = use_qk_norm
self.rms_norm_eps = rms_norm_eps
if self.use_qk_norm:
self.q_norm_key = f"{self.prefix}.q_norm"
self.k_norm_key = f"{self.prefix}.k_norm"
self.init_weight()
def init_weight(self):
self.q_norm_weight = self.create_parameter(
shape=[self.qk_head_dim],
dtype=self._dtype,
is_bias=False,
default_initializer=paddle.nn.initializer.Constant(0),
)
self.k_norm_weight = self.create_parameter(
shape=[self.qk_head_dim],
dtype=self._dtype,
is_bias=False,
default_initializer=paddle.nn.initializer.Constant(0),
)
def load_state_dict(self, state_dict: Dict[str, paddle.Tensor | np.ndarray]):
"""
@@ -109,6 +134,11 @@ class Attention(nn.Layer):
"""
if self.kvcache_quant_method is not None:
self.kvcache_quant_method.create_weights(self, state_dict)
if self.use_qk_norm:
q_norm_weight_tensor = paddle.to_tensor(get_tensor(state_dict.pop(self.q_norm_key + ".weight")))
k_norm_weight_tensor = paddle.to_tensor(get_tensor(state_dict.pop(self.k_norm_key + ".weight")))
self.q_norm_weight.set_value(q_norm_weight_tensor)
self.k_norm_weight.set_value(k_norm_weight_tensor)
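A minimal sketch of how the new weights are expected to appear in a checkpoint when use_qk_norm=True: load_state_dict pops "<prefix>.q_norm.weight" and "<prefix>.k_norm.weight" into the parameters created in init_weight, and the append-attention backend later forwards them (plus rms_norm_eps) to the fused kernel via getattr. The prefix and head dimension below are illustrative:

import numpy as np

head_dim = 128
prefix = "model.layers.0.self_attn"  # whatever prefix the Attention layer was built with

state_dict = {
    f"{prefix}.q_norm.weight": np.ones([head_dim], dtype="float32"),
    f"{prefix}.k_norm.weight": np.ones([head_dim], dtype="float32"),
    # ... the rest of the attention weights ...
}

# Keys consumed by Attention.load_state_dict when use_qk_norm is enabled.
assert f"{prefix}.q_norm.weight" in state_dict
assert f"{prefix}.k_norm.weight" in state_dict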
def forward(
self,

View File

@@ -60,6 +60,9 @@ def append_attention(
linear_shift: Optional[paddle.Tensor] = None,
linear_smooth: Optional[paddle.Tensor] = None,
kv_signal_data: Optional[paddle.Tensor] = None,
q_norm_weight: Optional[paddle.Tensor] = None,
k_norm_weight: Optional[paddle.Tensor] = None,
rms_norm_eps: float = 1e-6,
compute_type: str = "bf16",
cache_quant_type: str = "none",
use_neox_rotary_style: bool = False,
@@ -114,6 +117,9 @@ def append_attention(
linear_shift,
linear_smooth,
kv_signal_data,
q_norm_weight,
k_norm_weight,
rms_norm_eps,
compute_type,
cache_quant_type,
use_neox_rotary_style,

View File

@@ -17,6 +17,7 @@ import unittest
import numpy as np
import paddle
from paddle.incubate.nn.functional import fused_rms_norm
paddle.seed(10)
@@ -157,6 +158,8 @@ def naive_attention_impl(
cache_k_dequant_scales=None,
cache_v_dequant_scales=None,
use_cachekv_int8="None",
q_norm_weight=None,
k_norm_weight=None,
):
batch = query.shape[0]
heads = query.shape[1]
@@ -244,6 +247,27 @@ def get_qkv_and_qkv_concat_tensor(bs, q_num_head, kv_num_head, seq_len, dim_head
return q, k, v, qkv
def apply_qk_norm(head_dim, dtype, q, k):
q_norm_weight = np.random.random([head_dim]) / 10
k_norm_weight = np.random.random([head_dim]) / 10
q_norm_weight_tensor = paddle.to_tensor(q_norm_weight, dtype=dtype)
k_norm_weight_tensor = paddle.to_tensor(k_norm_weight, dtype=dtype)
print("q:", q.shape)
print("k:", k.shape)
bs, q_num_head, seq_len, dim_head = q.shape
_, kv_num_head, _, _ = k.shape
q = q.reshape([-1, head_dim])
k = k.reshape([-1, head_dim])
print("q:", q)
q = fused_rms_norm(q, q_norm_weight_tensor, None, 1e-5)[0]
print("q after norm:", q)
k = fused_rms_norm(k, k_norm_weight_tensor, None, 1e-5)[0]
q = q.reshape([-1, q_num_head, seq_len, dim_head])
k = k.reshape([-1, kv_num_head, seq_len, dim_head])
return q, k, q_norm_weight_tensor, k_norm_weight_tensor
def split_query_by_phase(
query,
seq_lens_encoder,
@@ -324,6 +348,7 @@ class TestAppendGroupQueryAttnWithRope(unittest.TestCase):
self.softmax_scale = self.dim_head**-0.5
self.rope_theta = 10000
self.dtype = "float16"
self.use_qk_norm = True
self.init_tensor()
def init_tensor(self):
@@ -394,6 +419,11 @@ class TestAppendGroupQueryAttnWithRope(unittest.TestCase):
)
q, k = self.rope._apply_rope(self.rope_emb, q, k, causal=True)
if self.use_qk_norm:
q, k, q_norm_weight, k_norm_weight = apply_qk_norm(self.dim_head, self.dtype, q, k)
else:
q_norm_weight = None
k_norm_weight = None
out_ = naive_attention_impl(
q,
k,
@@ -476,6 +506,9 @@ class TestAppendGroupQueryAttnWithRope(unittest.TestCase):
None, # linear_shift
None, # linear_smooth
None, # kv_signal_data
q_norm_weight, # q_norm_weight
k_norm_weight, # k_norm_weight
1e-6, # rms_norm_eps
"fp16",
"none", # cache_quant_type
self.use_neox_rotary_style,
@@ -580,6 +613,7 @@ class TestAppendGroupQueryAttnWithNeoXRope(TestAppendGroupQueryAttnWithRope):
self.softmax_scale = self.dim_head**-0.5
self.rope_theta = 10000
self.dtype = "float16"
self.use_qk_norm = False
self.init_tensor()