[Iluvatar GPU] Optimize attention performance and fix moe load ckpt error (#3651)

This commit is contained in:
yzwu
2025-09-22 21:13:59 +08:00
committed by GitHub
parent 5532e8a323
commit 504461b6b5
17 changed files with 1344 additions and 363 deletions

View File

@@ -28,18 +28,22 @@ jobs:
REPO="https://github.com/${{ github.repository }}.git" REPO="https://github.com/${{ github.repository }}.git"
FULL_REPO="${{ github.repository }}" FULL_REPO="${{ github.repository }}"
REPO_NAME="${FULL_REPO##*/}" REPO_NAME="${FULL_REPO##*/}"
BASE_BRANCH="${{ github.base_ref }}"
# Clean the repository directory before starting # Clean the repository directory before starting
docker run --rm --net=host -v $(pwd):/workspace -w /workspace \ docker run --rm --net=host -v $(pwd):/workspace -w /workspace \
-e "REPO_NAME=${REPO_NAME}" \ -e "REPO_NAME=${REPO_NAME}" \
-e "BASE_BRANCH=${BASE_BRANCH}" \
${docker_image} /bin/bash -c ' ${docker_image} /bin/bash -c '
if [ -d ${REPO_NAME} ]; then if [ -d ${REPO_NAME} ]; then
echo "Directory ${REPO_NAME} exists, removing it..." echo "Directory ${REPO_NAME} exists, removing it..."
rm -rf ${REPO_NAME} rm -rf ${REPO_NAME}
fi fi
' '
git config --global http.proxy "http://61.151.249.150:33128"
git config --global https.proxy "http://61.151.249.150:33128"
git config --global user.name "FastDeployCI" git config --global user.name "FastDeployCI"
git config --global user.email "fastdeploy_ci@example.com" git config --global user.email "fastdeploy_ci@example.com"
git clone ${REPO} ${REPO_NAME} git clone --recursive ${REPO} ${REPO_NAME} -b ${BASE_BRANCH}
cd FastDeploy cd FastDeploy
if [ "${{ github.event_name }}" = "pull_request" ]; then if [ "${{ github.event_name }}" = "pull_request" ]; then
git fetch origin pull/${{ github.event.pull_request.number }}/head:pr/${{ github.event.pull_request.number }} git fetch origin pull/${{ github.event.pull_request.number }}/head:pr/${{ github.event.pull_request.number }}

View File

@@ -193,11 +193,13 @@ public:
typedef uint8_t data_t;
};
+#ifndef PADDLE_WITH_COREX
template <> class PDTraits<paddle::DataType::FLOAT8_E4M3FN> {
public:
typedef __nv_fp8_e4m3 DataType;
typedef paddle::float8_e4m3fn data_t;
};
+#endif
template <typename T, int Size> struct alignas(sizeof(T) * Size) AlignedVector {
T val[Size];

View File

@@ -0,0 +1,376 @@
// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "helper.h"
#include "iluvatar_context.h"
template <paddle::DataType T>
void MixedFusedPagedAttnKernel(const paddle::Tensor& qkv,
paddle::Tensor& k_cache,
paddle::Tensor& v_cache,
const paddle::Tensor& prefill_block_table,
const paddle::Tensor& decode_block_table,
const paddle::Tensor& cu_seqlens_qkv,
const paddle::Tensor& seq_lens,
const paddle::optional<paddle::Tensor> &rope_sin,
const paddle::optional<paddle::Tensor> &rope_cos,
int prefill_num_tokens,
int num_heads,
int head_dim,
int num_kv_heads,
int block_size,
int max_seq_len,
float scale,
bool causal,
bool q_rope,
bool k_rope,
bool v_rope,
int window_left,
int window_right,
float softcap,
bool enable_cuda_graph,
bool use_sqrt_alibi,
paddle::Tensor& out) {
typedef PDTraits<T> traits_;
typedef typename traits_::data_t data_t;
const auto& dtype = qkv.dtype();
cuinferDataType_t cuinfer_data_type;
cudaDataType_t cu_data_type;
if (dtype == paddle::DataType::FLOAT16) {
cuinfer_data_type = CUINFER_DATA_HALF;
cu_data_type = CUDA_R_16F;
} else {
cuinfer_data_type = CUINFER_DATA_BFLOAT16;
cu_data_type = CUDA_R_16BF;
}
const auto& qkv_dims = qkv.dims();
const auto& kv_cache_dims = k_cache.dims();
const auto& prefill_block_table_dims = prefill_block_table.dims();
const auto& cu_seqlens_qkv_dims = cu_seqlens_qkv.dims();
int prefill_batch_size = prefill_block_table_dims[0];
int num_tokens = qkv_dims[0];
int decode_num_tokens = num_tokens - prefill_num_tokens;
int num_total_heads = num_heads + 2 * num_kv_heads;
int max_num_blocks_per_seq = prefill_block_table_dims[1];
int qkv_stride = qkv.strides()[0];
int num_blocks = kv_cache_dims[0];
int kv_block_stride = k_cache.strides()[0];
int kv_head_stride = k_cache.strides()[1];
int block_table_stride = prefill_block_table.strides()[0];
const float *rope_sin_ptr = rope_sin ? rope_sin.get().data<float>() : nullptr;
const float *rope_cos_ptr = rope_cos ? rope_cos.get().data<float>() : nullptr;
cuinferTensorDescriptor_t qkv_desc;
CUINFER_CHECK(cuinferCreateTensorDescriptor(&qkv_desc));
CUINFER_CHECK(cuinferSetTensorNdDescriptor(
qkv_desc,
cuinfer_data_type,
3,
std::vector<int>({prefill_num_tokens, num_total_heads, head_dim}).data(),
std::vector<int>({num_total_heads * head_dim, head_dim, 1}).data()));
cuinferTensorDescriptor_t qkv_seqlens_desc;
CUINFER_CHECK(cuinferCreateTensorDescriptor(&qkv_seqlens_desc));
CUINFER_CHECK(cuinferSetTensorNdDescriptor(
qkv_seqlens_desc,
CUINFER_DATA_INT32,
1,
std::vector<int>({prefill_batch_size + 1}).data(),
std::vector<int>({1}).data()));
cuinferTensorDescriptor_t block_table_desc;
CUINFER_CHECK(cuinferCreateTensorDescriptor(&block_table_desc));
CUINFER_CHECK(cuinferSetTensorNdDescriptor(
block_table_desc,
CUINFER_DATA_INT32,
2,
std::vector<int>({prefill_batch_size, block_table_stride}).data(),
std::vector<int>({block_table_stride, 1}).data()));
cuinferTensorDescriptor_t o_desc;
CUINFER_CHECK(cuinferCreateTensorDescriptor(&o_desc));
CUINFER_CHECK(cuinferSetTensorNdDescriptor(
o_desc,
cuinfer_data_type,
3,
std::vector<int>({prefill_num_tokens, num_heads, head_dim}).data(),
std::vector<int>({num_heads * head_dim, head_dim, 1}).data()));
cuinferTensorDescriptor_t k_cache_desc;
CUINFER_CHECK(cuinferCreateTensorDescriptor(&k_cache_desc));
CUINFER_CHECK(cuinferSetTensorNdDescriptor(
k_cache_desc,
cuinfer_data_type,
4,
std::vector<int>({num_blocks, num_kv_heads, block_size, head_dim}).data(),
std::vector<int>({num_kv_heads * block_size * head_dim, block_size * head_dim, head_dim, 1}).data()));
cuinferTensorDescriptor_t v_cache_desc;
CUINFER_CHECK(cuinferCreateTensorDescriptor(&v_cache_desc));
CUINFER_CHECK(cuinferSetTensorNdDescriptor(
v_cache_desc,
cuinfer_data_type,
4,
std::vector<int>({num_blocks, num_kv_heads, block_size, head_dim}).data(),
std::vector<int>({num_kv_heads * block_size * head_dim, block_size * head_dim, head_dim, 1}).data()));
cuinferTensorDescriptor_t cos_desc;
CUINFER_CHECK(cuinferCreateTensorDescriptor(&cos_desc));
CUINFER_CHECK(cuinferSetTensorNdDescriptor(
cos_desc,
CUINFER_DATA_FLOAT,
2,
std::vector<int>({max_seq_len, head_dim}).data(),
std::vector<int>({head_dim, 1}).data()));
cuinferTensorDescriptor_t sin_desc;
CUINFER_CHECK(cuinferCreateTensorDescriptor(&sin_desc));
CUINFER_CHECK(cuinferSetTensorNdDescriptor(
sin_desc,
CUINFER_DATA_FLOAT,
2,
std::vector<int>({max_seq_len, head_dim}).data(),
std::vector<int>({head_dim, 1}).data()));
cuinferHandle_t cuinfer_handle = iluvatar::getContextInstance()->getIxInferHandle();
size_t prefill_workspace_size = 0;
CUINFER_CHECK(cuinferGetFmhaFwdMergedFuseRopeWorkspaceSize(prefill_num_tokens,
num_heads,
num_kv_heads,
head_dim,
q_rope,
k_rope,
v_rope,
cuinfer_data_type,
cuinfer_data_type,
cuinfer_data_type,
&prefill_workspace_size));
auto* allocator = paddle::GetAllocator(qkv.place());
phi::Allocator::AllocationPtr prefill_tmp_workspace = allocator->Allocate(prefill_workspace_size);
void* prefill_workspace_ptr = prefill_tmp_workspace->ptr();
CUINFER_CHECK(cuinferFmhaFwdMergedFuseRopeFunc(cuinfer_handle,
qkv_desc,
qkv.data(),
qkv_seqlens_desc,
cu_seqlens_qkv.data<int32_t>(),
block_table_desc,
prefill_block_table.data<int32_t>(),
o_desc,
out.data(),
k_cache_desc,
k_cache.data(),
v_cache_desc,
v_cache.data(),
prefill_workspace_ptr,
prefill_workspace_size,
cos_desc,
rope_cos_ptr,
sin_desc,
rope_sin_ptr,
prefill_batch_size,
num_heads,
num_kv_heads,
head_dim,
causal,
scale,
q_rope,
k_rope,
v_rope));
size_t decode_workspace_size = 0;
CUINFER_CHECK(cuInferPageAttentionGetWorkspaceV7(decode_num_tokens,
num_heads,
num_kv_heads,
head_dim,
block_size,
max_seq_len,
&decode_workspace_size));
phi::Allocator::AllocationPtr decode_tmp_workspace = allocator->Allocate(decode_workspace_size);
void* decode_workspace_ptr = decode_tmp_workspace->ptr();
void* decode_qkv_ptr = (void*)(qkv.data<data_t>() + prefill_num_tokens * qkv_stride);
void* decode_out_ptr = (void*)(out.data<data_t>() + prefill_num_tokens * out.strides()[0]);
PageAttentionWithKVCacheArguments args{
static_cast<float>(scale), 1.0, 1.0, static_cast<float>(softcap), window_left, window_right,
causal, use_sqrt_alibi, enable_cuda_graph, false, nullptr, decode_qkv_ptr, decode_qkv_ptr,
decode_workspace_ptr, true, rope_sin_ptr, rope_cos_ptr};
CUINFER_CHECK(cuInferPageAttentionV7(cuinfer_handle,
decode_out_ptr,
cu_data_type,
decode_qkv_ptr,
cu_data_type,
decode_num_tokens,
num_heads,
num_kv_heads,
head_dim,
qkv_stride,
kv_block_stride,
kv_head_stride,
k_cache.data(),
cu_data_type,
v_cache.data(),
cu_data_type,
block_size,
max_num_blocks_per_seq,
max_seq_len,
decode_block_table.data<int32_t>(),
seq_lens.data<int32_t>(),
args));
CUINFER_CHECK(cuinferDestroyTensorDescriptor(qkv_desc));
CUINFER_CHECK(cuinferDestroyTensorDescriptor(qkv_seqlens_desc));
CUINFER_CHECK(cuinferDestroyTensorDescriptor(block_table_desc));
CUINFER_CHECK(cuinferDestroyTensorDescriptor(o_desc));
CUINFER_CHECK(cuinferDestroyTensorDescriptor(k_cache_desc));
CUINFER_CHECK(cuinferDestroyTensorDescriptor(v_cache_desc));
CUINFER_CHECK(cuinferDestroyTensorDescriptor(cos_desc));
CUINFER_CHECK(cuinferDestroyTensorDescriptor(sin_desc));
}
std::vector<paddle::Tensor> MixedFusedPagedAttn(const paddle::Tensor& qkv,
paddle::Tensor& k_cache,
paddle::Tensor& v_cache,
const paddle::Tensor& prefill_block_table,
const paddle::Tensor& decode_block_table,
const paddle::Tensor& cu_seqlens_qkv,
const paddle::Tensor& seq_lens,
const paddle::optional<paddle::Tensor> &rope_sin,
const paddle::optional<paddle::Tensor> &rope_cos,
int prefill_num_tokens,
int num_heads,
int head_dim,
int num_kv_heads,
int block_size,
int max_seq_len,
float scale,
bool causal,
bool q_rope,
bool k_rope,
bool v_rope,
int window_left,
int window_right,
float softcap,
bool enable_cuda_graph,
bool use_sqrt_alibi) {
const auto dtype = qkv.dtype();
auto out = paddle::empty({qkv.shape()[0], num_heads * head_dim}, dtype, qkv.place());
switch (dtype) {
case paddle::DataType::BFLOAT16:
MixedFusedPagedAttnKernel<paddle::DataType::BFLOAT16>(qkv,
k_cache,
v_cache,
prefill_block_table,
decode_block_table,
cu_seqlens_qkv,
seq_lens,
rope_sin,
rope_cos,
prefill_num_tokens,
num_heads,
head_dim,
num_kv_heads,
block_size,
max_seq_len,
scale,
causal,
q_rope,
k_rope,
v_rope,
window_left,
window_right,
softcap,
enable_cuda_graph,
use_sqrt_alibi,
out);
break;
case paddle::DataType::FLOAT16:
MixedFusedPagedAttnKernel<paddle::DataType::FLOAT16>(qkv,
k_cache,
v_cache,
prefill_block_table,
decode_block_table,
cu_seqlens_qkv,
seq_lens,
rope_sin,
rope_cos,
prefill_num_tokens,
num_heads,
head_dim,
num_kv_heads,
block_size,
max_seq_len,
scale,
causal,
q_rope,
k_rope,
v_rope,
window_left,
window_right,
softcap,
enable_cuda_graph,
use_sqrt_alibi,
out);
break;
default:
PD_THROW("Unsupported data type for mixed paged attn");
}
return {out};
}
std::vector<std::vector<int64_t>> MixedFusedPagedAttnInferShape(const std::vector<int64_t>& qkv_shape,
int num_heads,
int head_dim) {
return {{qkv_shape[0], num_heads * head_dim}};
}
std::vector<paddle::DataType> MixedFusedPagedAttnInferDtype(const paddle::DataType& qkv_dtype) {
return {qkv_dtype};
}
PD_BUILD_STATIC_OP(mixed_fused_paged_attn)
.Inputs({"qkv", "k_cache", "v_cache", "prefill_block_table", "decode_block_table",
"cu_seqlens_qkv", "seq_lens", paddle::Optional("rope_sin"), paddle::Optional("rope_cos")})
.Outputs({"out"})
.Attrs({"prefill_num_tokens:int",
"num_heads: int",
"head_dim:int",
"num_kv_heads:int",
"block_size:int",
"max_seq_len:int",
"scale:float",
"causal:bool",
"q_rope:bool",
"k_rope:bool",
"v_rope:bool",
"window_left:int",
"window_right:int",
"softcap:float",
"enable_cuda_graph:bool",
"use_sqrt_alibi:bool"})
.SetKernelFn(PD_KERNEL(MixedFusedPagedAttn))
.SetInferShapeFn(PD_INFER_SHAPE(MixedFusedPagedAttnInferShape))
.SetInferDtypeFn(PD_INFER_DTYPE(MixedFusedPagedAttnInferDtype));
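For orientation, the mixed op expects the prefill rows and the decode rows to be packed into a single `qkv` tensor with all prefill tokens first; `prefill_num_tokens` marks the split (the kernel offsets the decode pointer by `prefill_num_tokens * qkv_stride`), and the output comes back flattened as `[num_tokens, num_heads * head_dim]`. A minimal packing sketch follows; the sizes and dummy data are illustrative assumptions, not taken from any model:

```python
import paddle

# Illustrative sizes only; real values come from the model config.
num_heads, num_kv_heads, head_dim = 8, 8, 128
qkv_width = (num_heads + 2 * num_kv_heads) * head_dim

# 32 prefill tokens followed by 4 decode tokens (one per decoding sequence).
prefill_qkv = paddle.randn([32, qkv_width]).astype("float16")
decode_qkv = paddle.randn([4, qkv_width]).astype("float16")

# Prefill rows first, decode rows after them: this is the layout the mixed
# kernel assumes when it advances the decode pointer past the prefill tokens.
qkv = paddle.concat([prefill_qkv, decode_qkv], axis=0)
prefill_num_tokens = prefill_qkv.shape[0]  # passed as the first attribute

# The op's output shape is [num_tokens, num_heads * head_dim], per InferShape above.
```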

View File

@@ -53,6 +53,7 @@ void MoeDispatchKernel(const paddle::Tensor& input,
const paddle::optional<paddle::Tensor>& gating_correction_bias,
const int moe_topk,
const bool group_moe,
+const std::string &moe_quant_type,
const bool topk_only_mode,
const int num_rows,
const int hidden_size,
@@ -183,6 +184,7 @@ std::vector<paddle::Tensor> MoeExpertDispatch(
const paddle::optional<paddle::Tensor>& w4a8_in_scale,
const int moe_topk,
const bool group_moe,
+const std::string &moe_quant_type,
const bool topk_only_mode) {
const auto input_type = input.dtype();
auto place = input.place();
@@ -220,6 +222,7 @@ std::vector<paddle::Tensor> MoeExpertDispatch(
gating_correction_bias,
moe_topk,
group_moe,
+moe_quant_type,
topk_only_mode,
num_rows,
hidden_size,
@@ -236,6 +239,7 @@ std::vector<paddle::Tensor> MoeExpertDispatch(
gating_correction_bias,
moe_topk,
group_moe,
+moe_quant_type,
topk_only_mode,
num_rows,
hidden_size,
@@ -305,7 +309,7 @@ PD_BUILD_STATIC_OP(moe_expert_dispatch)
"top_k_weight",
"top_k_indices",
"expert_idx_per_token"})
-.Attrs({"moe_topk:int", "group_moe:bool", "topk_only_mode:bool"})
+.Attrs({"moe_topk:int", "group_moe:bool", "moe_quant_type:std::string", "topk_only_mode:bool"})
.SetKernelFn(PD_KERNEL(MoeExpertDispatch))
.SetInferShapeFn(PD_INFER_SHAPE(MoeExpertDispatchInferShape))
.SetInferDtypeFn(PD_INFER_DTYPE(MoeExpertDispatchInferDtype));

View File

@@ -27,6 +27,8 @@ void PagedAttnKernel(const paddle::Tensor& q,
const paddle::optional<paddle::Tensor> &v,
const paddle::optional<paddle::Tensor> &rope_sin,
const paddle::optional<paddle::Tensor> &rope_cos,
+int num_heads,
+int head_dim,
int num_kv_heads,
float scale,
int block_size,
@@ -86,32 +88,36 @@ void PagedAttnKernel(const paddle::Tensor& q,
common::errors::InvalidArgument(
"paged_attention expects seq_lens is contiguous"));
// check dim and shape
-// k_cache: [num_blocks, kv_num_heads, block_size, head_size]
-// v_cache: [num_blocks, kv_num_heads, block_size, head_size]
+// k_cache: [num_blocks, kv_num_heads, block_size, head_dim]
+// v_cache: [num_blocks, kv_num_heads, block_size, head_dim]
// block_table: [num_seqs, max_num_blocks_per_seq]
// seq_lens: [num_seqs]
// q and out:
-// merged_qkv = false: [num_seqs, num_heads, head_size]
-// merged_qkv = true: [num_seqs, num_heads+2*num_kv_heads, head_size]
+// if merged_qkv = false:
+//   q: [num_seqs, hidden_size]
+//   out: [num_seqs, hidden_size]
+// if merged_qkv = true:
+//   q: [num_seqs, (num_heads+2*num_kv_heads)*head_dim]
+//   out: [num_seqs, hidden_size]
const auto& q_dims = q.dims();
PADDLE_ENFORCE_EQ(q_dims.size(),
-3,
+2,
common::errors::InvalidArgument(
"paged_attn receive query dims is "
-"[num_seqs, num_heads, head_size]"));
+"[num_seqs, (num_heads+2*num_kv_heads)*head_dim]"));
PADDLE_ENFORCE_EQ(out.dims().size(),
-3,
+2,
common::errors::InvalidArgument(
"paged_attn receive out dims is "
-"[num_seqs, num_heads, head_size]"));
+"[num_seqs, hidden_size]"));
const auto& kv_cache_dims = k_cache.dims();
PADDLE_ENFORCE_EQ(kv_cache_dims.size(),
4,
common::errors::InvalidArgument(
"paged_attn receive kv cache dims is "
-"[num_blocks, kv_num_heads, block_size, head_size]"));
+"[num_blocks, kv_num_heads, block_size, head_dim]"));
const auto& block_table_dims = block_table.dims();
PADDLE_ENFORCE_EQ(block_table_dims.size(),
@@ -127,8 +133,6 @@ void PagedAttnKernel(const paddle::Tensor& q,
"paged_attn receive seq_lens dims is [num_seqs]"));
int num_seqs = q_dims[0];
-int num_heads = merged_qkv ? q_dims[1] - 2 * num_kv_heads : q_dims[1];
-int head_size = q_dims[2];
int max_num_blocks_per_seq = block_table_dims[1];
int q_stride = q.strides()[0];
int num_blocks = kv_cache_dims[0];
@@ -142,9 +146,9 @@ void PagedAttnKernel(const paddle::Tensor& q,
common::errors::InvalidArgument(
"kv_cache_dims[2] must be equal to block_size"));
PADDLE_ENFORCE_EQ(kv_cache_dims[3],
-head_size,
+head_dim,
common::errors::InvalidArgument(
-"kv_cache_dims[3] must be equal to head_size"));
+"kv_cache_dims[3] must be equal to head_dim"));
PADDLE_ENFORCE_EQ(block_table_dims[0],
num_seqs,
common::errors::InvalidArgument(
@@ -162,14 +166,13 @@ void PagedAttnKernel(const paddle::Tensor& q,
const float *rope_sin_ptr = merged_qkv ? rope_sin.get().data<float>() : nullptr;
const float *rope_cos_ptr = merged_qkv ? rope_cos.get().data<float>() : nullptr;
-auto dev_ctx = static_cast<const phi::CustomContext*>(paddle::experimental::DeviceContextPool::Instance().Get(q.place()));
cuinferHandle_t cuinfer_handle = iluvatar::getContextInstance()->getIxInferHandle();
size_t workspace_size = 0;
CUINFER_CHECK(cuInferPageAttentionGetWorkspaceV7(num_seqs,
num_heads,
num_kv_heads,
-head_size,
+head_dim,
block_size,
max_context_len,
&workspace_size));
@@ -189,7 +192,7 @@ void PagedAttnKernel(const paddle::Tensor& q,
num_seqs,
num_heads,
num_kv_heads,
-head_size,
+head_dim,
q_stride,
kv_block_stride,
kv_head_stride,
@@ -215,6 +218,8 @@ std::vector<paddle::Tensor> PagedAttn(const paddle::Tensor& q,
const paddle::optional<paddle::Tensor> &v,
const paddle::optional<paddle::Tensor> &rope_sin,
const paddle::optional<paddle::Tensor> &rope_cos,
+int num_heads,
+int head_dim,
int num_kv_heads,
float scale,
int block_size,
@@ -228,11 +233,7 @@ std::vector<paddle::Tensor> PagedAttn(const paddle::Tensor& q,
bool merged_qkv) {
const auto dtype = q.dtype();
-auto out_shape = q.shape();
-if (merged_qkv) {
-  out_shape[1] -= 2 * num_kv_heads;
-}
-auto out = paddle::empty(out_shape, dtype, q.place());
+auto out = paddle::empty({q.shape()[0], num_heads * head_dim}, dtype, q.place());
switch (dtype) {
case paddle::DataType::BFLOAT16:
@@ -246,6 +247,8 @@ std::vector<paddle::Tensor> PagedAttn(const paddle::Tensor& q,
v,
rope_sin,
rope_cos,
+num_heads,
+head_dim,
num_kv_heads,
scale,
block_size,
@@ -270,6 +273,8 @@ std::vector<paddle::Tensor> PagedAttn(const paddle::Tensor& q,
v,
rope_sin,
rope_cos,
+num_heads,
+head_dim,
num_kv_heads,
scale,
block_size,
@@ -299,6 +304,8 @@ std::vector<std::vector<int64_t>> PagedAttnInferShape(const std::vector<int64_t>
const std::vector<int64_t>& v_shape,
const std::vector<int64_t>& rope_sin_shape,
const std::vector<int64_t>& rope_cos_shape,
+int num_heads,
+int head_dim,
int num_kv_heads,
float scale,
int block_size,
@@ -311,36 +318,13 @@ std::vector<std::vector<int64_t>> PagedAttnInferShape(const std::vector<int64_t>
bool use_sqrt_alibi,
bool merged_qkv) {
if (merged_qkv) {
-int64_t num_tokens = q_shape[0];
-int64_t num_heads = q_shape[1] - 2 * num_kv_heads;
-int64_t head_dim = q_shape[2];
-return {{num_tokens, num_heads, head_dim}};
+return {{q_shape[0], num_heads * head_dim}};
} else {
return {q_shape};
}
}
-std::vector<paddle::DataType> PagedAttnInferDtype(const paddle::DataType& q_dtype,
-const paddle::DataType& k_cache_dtype,
-const paddle::DataType& v_cache_dtype,
-const paddle::DataType& block_table_dtype,
-const paddle::DataType& seq_lens_dtype,
-const paddle::DataType& alibi_slopes_dtype,
-const paddle::DataType& k_dtype,
-const paddle::DataType& v_dtype,
-const paddle::DataType& rope_sin_dtype,
-const paddle::DataType& rope_cos_dtype,
-int num_kv_heads,
-float scale,
-int block_size,
-int max_context_len,
-bool causal,
-int window_left,
-int window_right,
-float softcap,
-bool enable_cuda_graph,
-bool use_sqrt_alibi,
-bool merged_qkv) {
+std::vector<paddle::DataType> PagedAttnInferDtype(const paddle::DataType& q_dtype) {
return {q_dtype};
}
@@ -351,7 +335,9 @@ PD_BUILD_STATIC_OP(paged_attn)
paddle::Optional("v"), paddle::Optional("rope_sin"), paddle::Optional("v"), paddle::Optional("rope_sin"),
paddle::Optional("rope_cos")}) paddle::Optional("rope_cos")})
.Outputs({"out"}) .Outputs({"out"})
.Attrs({"num_kv_heads:int", .Attrs({"num_heads:int",
"head_dim:int",
"num_kv_heads:int",
"scale:float", "scale:float",
"block_size:int", "block_size:int",
"max_context_len:int", "max_context_len:int",

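One practical effect of this change is the output layout: `paged_attn` used to return a 3-D `[num_seqs, num_heads, head_dim]` tensor (with InferShape subtracting the KV heads when `merged_qkv` was set), whereas it now returns the flattened `[num_seqs, num_heads * head_dim]` layout directly. A small sketch of what callers no longer need to do (shapes are illustrative):

```python
import paddle

num_seqs, num_heads, head_dim = 4, 8, 128

# Old layout: a 3-D output that downstream layers flattened themselves.
out_old = paddle.randn([num_seqs, num_heads, head_dim])
hidden = out_old.reshape([num_seqs, num_heads * head_dim])

# New layout: the op already returns the flattened tensor, so the reshape
# above disappears from the calling code.
assert hidden.shape == [num_seqs, num_heads * head_dim]
```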
View File

@@ -0,0 +1,378 @@
// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "helper.h"
#include "iluvatar_context.h"
template <paddle::DataType T>
void PrefillFusedPagedAttnKernel(const paddle::Tensor& qkv,
paddle::Tensor& k_cache,
paddle::Tensor& v_cache,
const paddle::Tensor& block_table,
const paddle::Tensor& cu_seqlens_qkv,
const paddle::optional<paddle::Tensor> &rope_sin,
const paddle::optional<paddle::Tensor> &rope_cos,
int num_heads,
int head_dim,
int num_kv_heads,
int block_size,
int max_seq_len,
float scale,
bool causal,
bool q_rope,
bool k_rope,
bool v_rope,
paddle::Tensor& out) {
// check dtype and contiguous
const auto& dtype = qkv.dtype();
cuinferDataType_t data_type;
if (dtype == paddle::DataType::FLOAT16) {
data_type = CUINFER_DATA_HALF;
} else if (dtype == paddle::DataType::BFLOAT16) {
data_type = CUINFER_DATA_BFLOAT16;
} else {
common::errors::InvalidArgument("paged_attention support half and bfloat16 now");
}
PADDLE_ENFORCE_EQ(k_cache.dtype(),
dtype,
common::errors::InvalidArgument(
"k_cache dtype must be the same as query dtype"));
PADDLE_ENFORCE_EQ(k_cache.is_contiguous(),
true,
common::errors::InvalidArgument(
"paged_attention expects k_cache is contiguous"));
PADDLE_ENFORCE_EQ(block_table.dtype(),
paddle::DataType::INT32,
common::errors::InvalidArgument(
"block_table dtype must be int32"));
PADDLE_ENFORCE_EQ(block_table.is_contiguous(),
true,
common::errors::InvalidArgument(
"paged_attention expects block_table is contiguous"));
PADDLE_ENFORCE_EQ(cu_seqlens_qkv.dtype(),
paddle::DataType::INT32,
common::errors::InvalidArgument(
"cu_seqlens_qkv dtype must be int32"));
PADDLE_ENFORCE_EQ(cu_seqlens_qkv.is_contiguous(),
true,
common::errors::InvalidArgument(
"paged_attention expects cu_seqlens_qkv is contiguous"));
// check dim and shape
// k_cache: [num_blocks, kv_num_heads, block_size, head_dim]
// v_cache: [num_blocks, kv_num_heads, block_size, head_dim]
// block_table: [batch_size, max_num_blocks_per_seq]
// seq_lens: [batch_size]
// qkv: [num_tokens, (num_heads+2*num_kv_heads)*head_dim]
// out: [num_tokens, hidden_size]
const auto& qkv_dims = qkv.dims();
PADDLE_ENFORCE_EQ(qkv_dims.size(),
2,
common::errors::InvalidArgument(
"paged_attn receive query dims is "
"[num_tokens, (num_heads+2*num_kv_heads)*head_dim]"));
PADDLE_ENFORCE_EQ(out.dims().size(),
2,
common::errors::InvalidArgument(
"paged_attn receive out dims is "
"[num_tokens, hidden_size]"));
const auto& kv_cache_dims = k_cache.dims();
PADDLE_ENFORCE_EQ(kv_cache_dims.size(),
4,
common::errors::InvalidArgument(
"paged_attn receive kv cache dims is "
"[num_blocks, kv_num_heads, block_size, head_dim]"));
const auto& block_table_dims = block_table.dims();
PADDLE_ENFORCE_EQ(block_table_dims.size(),
2,
common::errors::InvalidArgument(
"paged_attn receive block_table dims is "
"[batch_size, max_num_blocks_per_seq]"));
const auto& cu_seqlens_qkv_dims = cu_seqlens_qkv.dims();
PADDLE_ENFORCE_EQ(cu_seqlens_qkv_dims.size(),
1,
common::errors::InvalidArgument(
"paged_attn receive cu_seqlens_qkv dims is [batch_size]"));
int batch_size = block_table_dims[0];
int num_tokens = qkv_dims[0];
int num_total_heads = num_heads + 2 * num_kv_heads;
int qkv_stride = qkv.strides()[0];
int num_blocks = kv_cache_dims[0];
PADDLE_ENFORCE_EQ(kv_cache_dims[1],
num_kv_heads,
common::errors::InvalidArgument(
"kv_cache_dims[1] must be equal to num_kv_head"));
PADDLE_ENFORCE_EQ(kv_cache_dims[2],
block_size,
common::errors::InvalidArgument(
"kv_cache_dims[2] must be equal to block_size"));
PADDLE_ENFORCE_EQ(kv_cache_dims[3],
head_dim,
common::errors::InvalidArgument(
"kv_cache_dims[3] must be equal to head_dim"));
PADDLE_ENFORCE_EQ(cu_seqlens_qkv_dims[0],
batch_size + 1,
common::errors::InvalidArgument(
"cu_seqlens_qkv_dims[0] must be equal to batch_size + 1"));
int block_table_stride = block_table.strides()[0];
const float *rope_sin_ptr = rope_sin ? rope_sin.get().data<float>() : nullptr;
const float *rope_cos_ptr = rope_cos ? rope_cos.get().data<float>() : nullptr;
cuinferHandle_t cuinfer_handle = iluvatar::getContextInstance()->getIxInferHandle();
size_t workspace_size = 0;
CUINFER_CHECK(cuinferGetFmhaFwdMergedFuseRopeWorkspaceSize(num_tokens,
num_heads,
num_kv_heads,
head_dim,
q_rope,
k_rope,
v_rope,
data_type,
data_type,
data_type,
&workspace_size));
auto* allocator = paddle::GetAllocator(qkv.place());
phi::Allocator::AllocationPtr tmp_workspace = allocator->Allocate(workspace_size);
void* workspace_ptr = tmp_workspace->ptr();
cuinferTensorDescriptor_t qkv_desc;
CUINFER_CHECK(cuinferCreateTensorDescriptor(&qkv_desc));
CUINFER_CHECK(cuinferSetTensorNdDescriptor(
qkv_desc,
data_type,
3,
std::vector<int>({num_tokens, num_total_heads, head_dim}).data(),
std::vector<int>({num_total_heads * head_dim, head_dim, 1}).data()));
cuinferTensorDescriptor_t qkv_seqlens_desc;
CUINFER_CHECK(cuinferCreateTensorDescriptor(&qkv_seqlens_desc));
CUINFER_CHECK(cuinferSetTensorNdDescriptor(
qkv_seqlens_desc,
CUINFER_DATA_INT32,
1,
std::vector<int>({batch_size + 1}).data(),
std::vector<int>({1}).data()));
cuinferTensorDescriptor_t block_table_desc;
CUINFER_CHECK(cuinferCreateTensorDescriptor(&block_table_desc));
CUINFER_CHECK(cuinferSetTensorNdDescriptor(
block_table_desc,
CUINFER_DATA_INT32,
2,
std::vector<int>({batch_size, block_table_stride}).data(),
std::vector<int>({block_table_stride, 1}).data()));
cuinferTensorDescriptor_t o_desc;
CUINFER_CHECK(cuinferCreateTensorDescriptor(&o_desc));
CUINFER_CHECK(cuinferSetTensorNdDescriptor(
o_desc,
data_type,
3,
std::vector<int>({num_tokens, num_heads, head_dim}).data(),
std::vector<int>({num_heads * head_dim, head_dim, 1}).data()));
cuinferTensorDescriptor_t k_cache_desc;
CUINFER_CHECK(cuinferCreateTensorDescriptor(&k_cache_desc));
CUINFER_CHECK(cuinferSetTensorNdDescriptor(
k_cache_desc,
data_type,
4,
std::vector<int>({num_blocks, num_kv_heads, block_size, head_dim}).data(),
std::vector<int>({num_kv_heads * block_size * head_dim, block_size * head_dim, head_dim, 1}).data()));
cuinferTensorDescriptor_t v_cache_desc;
CUINFER_CHECK(cuinferCreateTensorDescriptor(&v_cache_desc));
CUINFER_CHECK(cuinferSetTensorNdDescriptor(
v_cache_desc,
data_type,
4,
std::vector<int>({num_blocks, num_kv_heads, block_size, head_dim}).data(),
std::vector<int>({num_kv_heads * block_size * head_dim, block_size * head_dim, head_dim, 1}).data()));
cuinferTensorDescriptor_t cos_desc;
CUINFER_CHECK(cuinferCreateTensorDescriptor(&cos_desc));
CUINFER_CHECK(cuinferSetTensorNdDescriptor(
cos_desc,
CUINFER_DATA_FLOAT,
2,
std::vector<int>({max_seq_len, head_dim}).data(),
std::vector<int>({head_dim, 1}).data()));
cuinferTensorDescriptor_t sin_desc;
CUINFER_CHECK(cuinferCreateTensorDescriptor(&sin_desc));
CUINFER_CHECK(cuinferSetTensorNdDescriptor(
sin_desc,
CUINFER_DATA_FLOAT,
2,
std::vector<int>({max_seq_len, head_dim}).data(),
std::vector<int>({head_dim, 1}).data()));
CUINFER_CHECK(cuinferFmhaFwdMergedFuseRopeFunc(cuinfer_handle,
qkv_desc,
qkv.data(),
qkv_seqlens_desc,
cu_seqlens_qkv.data<int32_t>(),
block_table_desc,
block_table.data<int32_t>(),
o_desc,
out.data(),
k_cache_desc,
k_cache.data(),
v_cache_desc,
v_cache.data(),
workspace_ptr,
workspace_size,
cos_desc,
rope_cos_ptr,
sin_desc,
rope_sin_ptr,
batch_size,
num_heads,
num_kv_heads,
head_dim,
causal,
scale,
q_rope,
k_rope,
v_rope));
CUINFER_CHECK(cuinferDestroyTensorDescriptor(qkv_desc));
CUINFER_CHECK(cuinferDestroyTensorDescriptor(qkv_seqlens_desc));
CUINFER_CHECK(cuinferDestroyTensorDescriptor(block_table_desc));
CUINFER_CHECK(cuinferDestroyTensorDescriptor(o_desc));
CUINFER_CHECK(cuinferDestroyTensorDescriptor(k_cache_desc));
CUINFER_CHECK(cuinferDestroyTensorDescriptor(v_cache_desc));
CUINFER_CHECK(cuinferDestroyTensorDescriptor(cos_desc));
CUINFER_CHECK(cuinferDestroyTensorDescriptor(sin_desc));
}
std::vector<paddle::Tensor> PrefillFusedPagedAttn(const paddle::Tensor& qkv,
paddle::Tensor& k_cache,
paddle::Tensor& v_cache,
const paddle::Tensor& block_table,
const paddle::Tensor& cu_seqlens_qkv,
const paddle::optional<paddle::Tensor> &rope_sin,
const paddle::optional<paddle::Tensor> &rope_cos,
int num_heads,
int head_dim,
int num_kv_heads,
int block_size,
int max_seq_len,
float scale,
bool causal,
bool q_rope,
bool k_rope,
bool v_rope) {
const auto dtype = qkv.dtype();
auto out = paddle::empty({qkv.shape()[0], num_heads * head_dim}, dtype, qkv.place());
switch (dtype) {
case paddle::DataType::BFLOAT16:
PrefillFusedPagedAttnKernel<paddle::DataType::BFLOAT16>(qkv,
k_cache,
v_cache,
block_table,
cu_seqlens_qkv,
rope_sin,
rope_cos,
num_heads,
head_dim,
num_kv_heads,
block_size,
max_seq_len,
scale,
causal,
q_rope,
k_rope,
v_rope,
out);
break;
case paddle::DataType::FLOAT16:
PrefillFusedPagedAttnKernel<paddle::DataType::FLOAT16>(qkv,
k_cache,
v_cache,
block_table,
cu_seqlens_qkv,
rope_sin,
rope_cos,
num_heads,
head_dim,
num_kv_heads,
block_size,
max_seq_len,
scale,
causal,
q_rope,
k_rope,
v_rope,
out);
break;
default:
PD_THROW("Unsupported data type for Paged attn");
}
return {out};
}
std::vector<std::vector<int64_t>> PrefillFusedPagedAttnInferShape(const std::vector<int64_t>& qkv_shape,
const std::vector<int64_t>& k_cache_shape,
const std::vector<int64_t>& v_cache_shape,
const std::vector<int64_t>& block_table_shape,
const std::vector<int64_t>& cu_seqlens_qkv_shape,
const std::vector<int64_t>& rope_sin_shape,
const std::vector<int64_t>& rope_cos_shape,
int num_heads,
int head_dim,
int num_kv_heads,
int block_size,
int max_seq_len,
float scale,
bool causal,
bool q_rope,
bool k_rope,
bool v_rope) {
return {{qkv_shape[0], num_heads * head_dim}};
}
std::vector<paddle::DataType> PrefillFusedPagedAttnInferDtype(const paddle::DataType& qkv_dtype) {
return {qkv_dtype};
}
PD_BUILD_STATIC_OP(prefill_fused_paged_attn)
.Inputs({"qkv", "k_cache", "v_cache", "block_table", "cu_seqlens_qkv",
paddle::Optional("rope_sin"), paddle::Optional("rope_cos")})
.Outputs({"out"})
.Attrs({"num_heads:int",
"head_dim:int",
"num_kv_heads:int",
"block_size:int",
"max_seq_len:int",
"scale:float",
"causal:bool",
"q_rope:bool",
"k_rope:bool",
"v_rope:bool"})
.SetKernelFn(PD_KERNEL(PrefillFusedPagedAttn))
.SetInferShapeFn(PD_INFER_SHAPE(PrefillFusedPagedAttnInferShape))
.SetInferDtypeFn(PD_INFER_DTYPE(PrefillFusedPagedAttnInferDtype));
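A hedged end-to-end sketch of calling the new op from Python, assuming the generated binding takes the declared inputs followed by the attributes in declaration order (as Paddle custom ops normally expose them); the shapes, block-table contents, and rope tables below are illustrative, and the call only works where the iluvatar ops are actually built:

```python
import paddle
from fastdeploy.model_executor.ops.iluvatar import prefill_fused_paged_attention

# Illustrative configuration.
num_heads, num_kv_heads, head_dim = 8, 8, 128
block_size, max_seq_len, num_blocks = 16, 512, 64
seq_len = 32  # a single prefill sequence

qkv = paddle.randn([seq_len, (num_heads + 2 * num_kv_heads) * head_dim]).astype("float16")
k_cache = paddle.zeros([num_blocks, num_kv_heads, block_size, head_dim], dtype="float16")
v_cache = paddle.zeros_like(k_cache)
block_table = paddle.arange(4, dtype="int32").reshape([1, 4])   # enough blocks for 32 tokens
cu_seqlens_qkv = paddle.to_tensor([0, seq_len], dtype="int32")  # [batch_size + 1]
rope_cos = paddle.zeros([max_seq_len, head_dim], dtype="float32")
rope_sin = paddle.zeros([max_seq_len, head_dim], dtype="float32")

out = prefill_fused_paged_attention(
    qkv, k_cache, v_cache, block_table, cu_seqlens_qkv, rope_sin, rope_cos,
    num_heads, head_dim, num_kv_heads, block_size, max_seq_len,
    1.0 / head_dim**0.5,  # scale
    True,                 # causal
    True, True, False,    # q_rope, k_rope, v_rope
)
# out: [seq_len, num_heads * head_dim], with k_cache / v_cache filled in place.
```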

View File

@@ -536,6 +536,8 @@ elif paddle.is_compiled_with_custom_device("iluvatar_gpu"):
"iluvatar_ops/moe_dispatch.cu", "iluvatar_ops/moe_dispatch.cu",
"iluvatar_ops/moe_reduce.cu", "iluvatar_ops/moe_reduce.cu",
"iluvatar_ops/paged_attn.cu", "iluvatar_ops/paged_attn.cu",
"iluvatar_ops/prefill_fused_attn.cu",
"iluvatar_ops/mixed_fused_attn.cu",
"iluvatar_ops/w8a16_group_gemm.cu", "iluvatar_ops/w8a16_group_gemm.cu",
"iluvatar_ops/runtime/iluvatar_context.cc", "iluvatar_ops/runtime/iluvatar_context.cc",
], ],
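After rebuilding the wheel with these two extra sources, a quick import check (a sketch; it assumes the build succeeded on an iluvatar device) confirms the new kernels are exposed alongside the existing one:

```python
# The import path matches the one used by the attention backend later in this commit.
from fastdeploy.model_executor.ops.iluvatar import (
    mixed_fused_paged_attention,
    paged_attention,
    prefill_fused_paged_attention,
)

print(mixed_fused_paged_attention, paged_attention, prefill_fused_paged_attention)
```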

View File

@@ -1,5 +1,4 @@
# Run ERNIE-4.5-300B-A47B & ERNIE-4.5-21B-A3B model on iluvatar machine
-The current version of the software merely serves as a demonstration demo for the Iluvatar CoreX combined with the Fastdeploy inference framework for large models. Running the latest ERNIE4.5 300B model on the GSM8K dataset takes about 6.3 hours.
## Machine Preparation
First, the `TP=16` when running the ERNIE4.5 300B model and so you need to prepare a machine with the following configurations:
@@ -30,7 +29,7 @@ docker exec -it paddle_infer bash
### Install paddle
```bash
-pip3 install paddlepaddle==3.1.1 -i https://www.paddlepaddle.org.cn/packages/stable/cpu/
+pip3 install paddlepaddle==3.1.0a0 -i https://www.paddlepaddle.org.cn/packages/stable/cpu/
pip3 install paddle-iluvatar-gpu==3.1.1 -i https://www.paddlepaddle.org.cn/packages/stable/ixuca/
```
For latest paddle version on iluvatar. Refer to [PaddlePaddle Installation](https://www.paddlepaddle.org.cn/)
@@ -78,7 +77,7 @@ prompts = [
sampling_params = SamplingParams(temperature=0.8, top_p=0.95, max_tokens=256)
# load the model
-llm = LLM(model="/home/paddle/ernie-4_5-21b-a3b-bf16-paddle", tensor_parallel_size=4, max_model_len=8192, static_decode_blocks=0, block_size=16, quantization='wint8')
+llm = LLM(model="/home/paddle/ernie-4_5-21b-a3b-bf16-paddle", tensor_parallel_size=4, max_model_len=8192, block_size=16, quantization='wint8')
# Perform batch inference
outputs = llm.generate(prompts, sampling_params)
@@ -390,7 +389,7 @@ export INFERENCE_MSG_QUEUE_ID=232132
export LD_PRELOAD=/usr/local/corex/lib64/libcuda.so.1
export FD_SAMPLING_CLASS=rejection
-python3 -m fastdeploy.entrypoints.openai.api_server --model "/home/paddle/ernie-45t" --port 8188 --tensor-parallel-size 16 --block-size 16 --static-decode-blocks 0 --quantization wint8
+python3 -m fastdeploy.entrypoints.openai.api_server --model "/home/paddle/ernie-45t" --port 8188 --tensor-parallel-size 16 --block-size 16 --quantization wint8
```
4. Running the Script
@@ -403,10 +402,10 @@ After the service is ready, open another terminal and run:
```bash
python3 -u bench_gsm8k.py --port 8188 --num-questions 1319 --num-shots 5 --parallel 8
```
-It takes about 6.3 hours to run the GSM8K dataset.
+It takes about 4.8 hours to run the GSM8K dataset.
```
-Accuracy: 0.964
+Accuracy: 0.962
Invaild: 0.000
-Latency: 22918.186 s
+Latency: 17332.728 s
```
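Before kicking off the full benchmark, it can help to smoke-test the server with a single request; the payload below mirrors what `bench_gsm8k.py` sends (the host and port are assumptions matching the examples above):

```python
import json

import requests

url = "http://127.0.0.1:8188/v1/chat/completions"
data = {
    "messages": [{"role": "user", "content": "Question: What is 12 * 7?\nAnswer:"}],
    "temperature": 0.6,
    "max_tokens": 128,
    "top_p": 0.95,
    "do_sample": True,
}
# Same request shape as bench_gsm8k.py's call_generate helper.
response = requests.post(url, headers={"Content-Type": "application/json"}, data=json.dumps(data))
print(response.json()["choices"][0]["message"]["content"])
```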

View File

@@ -1,12 +1,11 @@
# How to run ERNIE-4.5-300B-A47B-BF16 & ERNIE-4.5-21B-A3B on iluvatar machines
-The current version of this software only serves as a demonstration of Iluvatar CoreX combined with the FastDeploy inference framework for large models. Running the latest ERNIE4.5 300B model on the GSM8K dataset takes about 6.3 hours.
## Machine Preparation
-First, you need to prepare a machine with the following configuration:
+First, running the ERNIE4.5 300B model requires `TP=16`, so you need to prepare a machine with the following configuration:
| CPU | Memory | Iluvatar GPU | Disk |
|-----|------|-----|-----|
-| x86 | 1TB| 8xBI150| 1TB|
+| x86 | 1TB| 16xBI150| 1TB|
Currently the full model has to be loaded into host memory, which requires more than 600GB of host memory; later versions will optimize this.
@@ -30,7 +29,7 @@ docker exec -it paddle_infer bash
### Install paddle
```bash
-pip3 install paddlepaddle==3.1.1 -i https://www.paddlepaddle.org.cn/packages/stable/cpu/
+pip3 install paddlepaddle==3.1.0a0 -i https://www.paddlepaddle.org.cn/packages/stable/cpu/
pip3 install paddle-iluvatar-gpu==3.1.1 -i https://www.paddlepaddle.org.cn/packages/stable/ixuca/
```
For the latest paddle version on iluvatar, refer to [PaddlePaddle Installation](https://www.paddlepaddle.org.cn/)
@@ -77,7 +76,7 @@ prompts = [
sampling_params = SamplingParams(temperature=0.8, top_p=0.95, max_tokens=256)
# Load the model
-llm = LLM(model="/home/paddle/ernie-4_5-21b-a3b-bf16-paddle", tensor_parallel_size=4, max_model_len=8192, static_decode_blocks=0, quantization='wint8')
+llm = LLM(model="/home/paddle/ernie-4_5-21b-a3b-bf16-paddle", tensor_parallel_size=4, max_model_len=8192, quantization='wint8')
# Perform batch inference (the LLM queues requests internally and handles them with dynamic insertion based on available resources)
outputs = llm.generate(prompts, sampling_params)
@@ -132,3 +131,281 @@ Now, let's break down each step:
**Step 3: Drawing the
The largest ocean is the Pacific Ocean, covering an area of approximately … [3], The first scientific expeditions to determine the ocean's depth were the Challenger expedition (1872–1876) and the U.S. Navy Hydrographic Office survey (1877–1879). The oceanic crust is thin and irregular, consisting of upward moving magma from the mantle below, and cooling and solidifying on the surface. The shallowest parts of the ocean are called the continental shelves. Large tides are caused mainly by the alignment of the Sun, Moon, and Earth during new or full moons. The origin of the word "ocean" is not clear. The first global oceanic topography survey was completed by the Challenger expedition (1872–1876). [57] The sound speed in the ocean is primarily a function of water temperature and salinity, and varies with depth. The deep-ocean floor is mostly flat and devoid of life, with the exception of seamounts and various underwater volcanic features, including seamounts and hydrothermal vents. [73] Today, the five ocean
```
## Run the ernie4.5 300B model on the GSM8K dataset
1. Download the GSM8K dataset
```bash
wget https://raw.githubusercontent.com/openai/grade-school-math/master/grade_school_math/data/test.jsonl
```
2. Prepare `bench_gsm8k.py`
```python
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Fastdeploy + ERNIE-4.5-Turbo 的指标评估 """
# adapted from https://github.com/sgl-project/sglang/blob/main/benchmark/gsm8k/bench_other.py
import argparse
import ast
import json
import re
import time
from concurrent.futures import ThreadPoolExecutor
import numpy as np
import requests
from tqdm import tqdm
INVALID = -9999999
def call_generate(prompt, **kwargs):
"""
Generates response based on the input prompt.
Args:
prompt (str): The input prompt text.
**kwargs: Keyword arguments, including server IP address and port number.
Returns:
str: The response generated based on the prompt.
"""
url = f"http://{kwargs['ip']}:{kwargs['port']}/v1/chat/completions"
headers = {"Content-Type": "application/json"}
data = {
"messages": [
{
"role": "user",
"content": prompt,
}
],
"temperature": 0.6,
"max_tokens": 2047,
"top_p": 0.95,
"do_sample": True,
}
response = requests.post(url, headers=headers, data=json.dumps(data))
out = response.json()
return out["choices"][0]["message"]["content"]
def get_one_example(lines, i, include_answer):
"""
Retrieves a question-answer example from the given list of text lines.
Args:
lines (list of dict): A list of question-answer pairs.
i (int): The index of the question-answer pair to retrieve from lines.
include_answer (bool): Whether to include the answer in the returned string.
Returns:
str: A formatted question-answer string in the format "Question: <question>\nAnswer: <answer>".
"""
ret = "Question: " + lines[i]["question"] + "\nAnswer:"
if include_answer:
ret += " " + lines[i]["answer"]
return ret
def get_few_shot_examples(lines, k):
"""
Selects k examples from the given list of text lines and concatenates them into a single string.
Args:
lines (list): A list containing text lines.
k (int): The number of examples to select.
Returns:
str: A string composed of k examples, separated by two newline characters.
"""
ret = ""
for i in range(k):
ret += get_one_example(lines, i, True) + "\n\n"
return ret
def get_answer_value(answer_str):
"""
Extracts numerical values from an answer string and returns them.
Args:
answer_str (str): The string containing the answer.
Returns:
The extracted numerical value; returns "INVALID" if extraction fails.
"""
answer_str = answer_str.replace(",", "")
numbers = re.findall(r"\d+", answer_str)
if len(numbers) < 1:
return INVALID
try:
return ast.literal_eval(numbers[-1])
except SyntaxError:
return INVALID
def read_jsonl(filename: str):
"""
Reads a JSONL file.
Args:
filename (str): Path to the JSONL file.
Yields:
dict: A dictionary object corresponding to each line in the JSONL file.
"""
with open(filename) as fin:
for line in fin:
if line.startswith("#"):
continue
yield json.loads(line)
def main(args):
"""
Process inputs and generate answers by calling the model in parallel using a thread pool.
Args:
args (argparse.Namespace):
- num_questions (int): Number of questions to process.
- num_shots (int): Number of few-shot learning examples.
- ip (str): IP address of the model service.
- port (int): Port number of the model service.
- parallel (int): Number of questions to process in parallel.
- result_file (str): File path to store the results.
Returns:
None
"""
# Read data
filename = "test.jsonl"
lines = list(read_jsonl(filename))
# Construct prompts
num_questions = args.num_questions
num_shots = args.num_shots
few_shot_examples = get_few_shot_examples(lines, num_shots)
questions = []
labels = []
for i in range(len(lines[:num_questions])):
questions.append(get_one_example(lines, i, False))
labels.append(get_answer_value(lines[i]["answer"]))
assert all(l != INVALID for l in labels)
states = [None] * len(labels)
# Use thread pool
def get_one_answer(i):
answer = call_generate(
prompt=few_shot_examples + questions[i],
# stop=["Question", "Assistant:", "<|separator|>"],
ip=args.ip,
port=args.port,
)
states[i] = answer
tic = time.time()
if args.parallel == 1:
for i in tqdm(range(len(questions))):
get_one_answer(i)
else:
with ThreadPoolExecutor(args.parallel) as executor:
list(
tqdm(
executor.map(get_one_answer, list(range(len(questions)))),
total=len(questions),
)
)
latency = time.time() - tic
preds = []
for i in range(len(states)):
preds.append(get_answer_value(states[i]))
# Compute accuracy
acc = np.mean(np.array(preds) == np.array(labels))
invalid = np.mean(np.array(preds) == INVALID)
# Print results
print(f"Accuracy: {acc:.3f}")
print(f"Invalid: {invalid:.3f}")
print(f"Latency: {latency:.3f} s")
with open(args.result_file, "a") as fout:
value = {
"task": "gsm8k",
"backend": "paddlepaddle",
"num_gpus": 1,
"latency": round(latency, 3),
"accuracy": round(acc, 3),
"num_requests": args.num_questions,
"other": {
"num_questions": args.num_questions,
"parallel": args.parallel,
},
}
fout.write(json.dumps(value) + "\n")
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--ip", type=str, default="127.0.0.1")
parser.add_argument("--port", type=str, default="8188")
parser.add_argument("--num-shots", type=int, default=10)
parser.add_argument("--data-path", type=str, default="test.jsonl")
parser.add_argument("--num-questions", type=int, default=1319)
parser.add_argument("--result-file", type=str, default="result.jsonl")
parser.add_argument("--parallel", type=int, default=1)
args = parser.parse_args()
main(args)
```
3. Prepare `run_bench.sh`
```bash
#!/bin/bash
export PADDLE_XCCL_BACKEND=iluvatar_gpu
export INFERENCE_MSG_QUEUE_ID=232132
export LD_PRELOAD=/usr/local/corex/lib64/libcuda.so.1
export FD_SAMPLING_CLASS=rejection
python3 -m fastdeploy.entrypoints.openai.api_server --model "/home/paddle/ernie-45t" --port 8188 --tensor-parallel-size 16 --block-size 16 --quantization wint8
```
4. Running the script
First, open a terminal and start the server:
```bash
./run_bench.sh
```
After the service is ready, open another terminal and run the client command:
```bash
python3 -u bench_gsm8k.py --port 8188 --num-questions 1319 --num-shots 5 --parallel 8
```
Running inference on the whole GSM8K dataset takes about 4.8 hours.
```
Accuracy: 0.962
Invaild: 0.000
Latency: 17332.728 s
```

View File

@@ -1186,9 +1186,7 @@ class CacheConfig:
self.kv_cache_ratio = 1.0
else:
self.kv_cache_ratio = 0.75
-self.enc_dec_block_num = (
-    0 if current_platform.is_iluvatar() or current_platform.is_maca() else envs.FD_ENC_DEC_BLOCK_NUM
-)
+self.enc_dec_block_num = 0 if current_platform.is_maca() else envs.FD_ENC_DEC_BLOCK_NUM
self.prealloc_dec_block_slot_num_threshold = 12
self.cache_dtype = "bfloat16"
self.model_cfg = None
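With this change, iluvatar stops forcing `enc_dec_block_num` to 0 and follows the `FD_ENC_DEC_BLOCK_NUM` setting like other GPU platforms; only MACA still pins it to 0. A tiny sketch of the resulting behaviour (the fallback default of 2 is an assumption for illustration; the real default lives in `fastdeploy.envs`):

```python
import os

def resolve_enc_dec_block_num(is_maca: bool) -> int:
    # Mirrors the new expression above; only the MACA platform still gets 0.
    # The "2" fallback is assumed here, not taken from fastdeploy.envs.
    return 0 if is_maca else int(os.getenv("FD_ENC_DEC_BLOCK_NUM", "2"))

print(resolve_enc_dec_block_num(is_maca=False))  # iluvatar now uses the env-configured value
print(resolve_enc_dec_block_num(is_maca=True))   # 0
```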

View File

@@ -16,13 +16,11 @@
from __future__ import annotations
-import os
from dataclasses import dataclass
from math import sqrt
from typing import TYPE_CHECKING, Optional
import paddle
-from paddle.nn.functional.flash_attention import flash_attn_unpadded
from fastdeploy.config import FDConfig
from fastdeploy.model_executor.layers.attention.attention import Attention
@@ -30,7 +28,11 @@ from fastdeploy.model_executor.layers.attention.base_attention_backend import (
AttentionBackend,
AttentionMetadata,
)
-from fastdeploy.model_executor.ops.iluvatar import paged_attention
+from fastdeploy.model_executor.ops.iluvatar import (
+    mixed_fused_paged_attention,
+    paged_attention,
+    prefill_fused_paged_attention,
+)
if TYPE_CHECKING:
from fastdeploy.model_executor.forward_meta import ForwardMeta
@@ -42,26 +44,7 @@ class IluvatarAttentionMetadata(AttentionMetadata):
IluvatarAttentionMetadata
"""
-# flash_attn metadata
-cu_seqlens_q: Optional[paddle.Tensor] = None
-cu_seqlens_k: Optional[paddle.Tensor] = None
-fixed_seed_offset: Optional[paddle.Tensor] = None
-attn_mask: Optional[paddle.Tensor] = None
-attn_mask_start_row_indices: Optional[paddle.Tensor] = None
-dropout: float = 0.0
-causal: bool = True
-return_softmax: bool = False
-rng_name: str = ""
-# paged_attn metadata
-block_tables: Optional[paddle.Tensor] = None
-seq_lens: Optional[paddle.Tensor] = None
-num_kv_heads: int = 1
-scale: float = 1.0
-block_size: int = 1
-max_context_len: int = 1
alibi_slopes: Optional[paddle.Tensor] = None
-# causal: bool = True
window_left: int = -1
window_right: int = -1
softcap: float = 0.0
@@ -88,55 +71,44 @@ class IluvatarAttnBackend(AttentionBackend):
def __init__(self, fd_config: FDConfig, kv_num_heads: int, num_heads: int, head_dim: int):
super().__init__()
self.attention_metadata = IluvatarAttentionMetadata()
-self.attention_metadata.block_size = fd_config.parallel_config.block_size
-assert (
-    fd_config.parallel_config.enc_dec_block_num == 0
-), f"Iluvatar does not support yet, {fd_config.parallel_config.enc_dec_block_num}"
-assert self.attention_metadata.block_size == 16, "Iluvatar paged attn requires block_size must be 16."
-self.attention_metadata.max_context_len = fd_config.parallel_config.max_model_len
-self.attention_metadata.causal = getattr(fd_config.model_config, "causal", True)
+self.block_size = fd_config.parallel_config.block_size
+assert self.block_size == 16, "Iluvatar paged attn requires block_size must be 16."
+self.max_context_len = fd_config.parallel_config.max_model_len
+self.causal = getattr(fd_config.model_config, "causal", True)
self.speculate_method = getattr(fd_config.parallel_config, "speculate_method", None)
self.use_speculate = self.speculate_method is not None
-self.attention_metadata.num_kv_heads = kv_num_heads
-self.attention_metadata.dropout = fd_config.model_config.hidden_dropout_prob
+self.num_kv_heads = kv_num_heads
self.num_heads = num_heads
self.total_num_heads = num_heads + 2 * kv_num_heads
self.head_dim = head_dim
-self.hidden_dim = num_heads * head_dim
-self.total_hidden_dim = self.total_num_heads * head_dim
+self.hidden_dim = fd_config.model_config.hidden_size
# note: scale need to change if using MLA
-self.attention_metadata.scale = 1.0 / sqrt(head_dim)
+self.scale = 1.0 / sqrt(head_dim)
self.num_layers = fd_config.model_config.num_hidden_layers
self.dtype = paddle.get_default_dtype()
-self.record_block_table_metadata = {}
-self.enable_fused_attention = int(os.getenv("FD_ILUVATAR_ENABLE_FUSED_ATTN", 1))
def init_attention_metadata(self, forward_meta: ForwardMeta):
"""Initialize attntion metadata hence all layers in the forward pass can reuse it."""
+self.rope_cos = forward_meta.rotary_embs[0, 0, :, :, :]
+self.rope_sin = forward_meta.rotary_embs[1, 0, :, :, :]
self.prefill_info_dict = {}
self.decode_info_dict = {}
-prefill_non_zeros_ids = forward_meta.seq_lens_this_time > 1
-decode_non_zeros_ids = forward_meta.seq_lens_this_time == 1
-self.prefill_info_dict["batch_ids"] = paddle.where(prefill_non_zeros_ids)[0]
-self.decode_info_dict["batch_ids"] = paddle.where(decode_non_zeros_ids)[0]
+self.prefill_info_dict["batch_ids"] = paddle.where(forward_meta.seq_lens_encoder)[0]
+self.decode_info_dict["batch_ids"] = paddle.where(forward_meta.seq_lens_decoder)[0]
self.prefill_len = len(self.prefill_info_dict["batch_ids"])
self.decode_len = len(self.decode_info_dict["batch_ids"])
# only prefill
if self.decode_len == 0:
cu_seq_ids = list(range(self.prefill_len + 1))
self.prefill_info_dict["cu_seqlens_q"] = forward_meta.cu_seqlens_q[cu_seq_ids]
+self.mixed = False
# only decode
elif self.prefill_len == 0:
-pass
+self.mixed = False
# both prefill and decode
else:
-prefill_num_tokens = paddle.sum(forward_meta.seq_lens_this_time[prefill_non_zeros_ids])
-decode_num_tokens = paddle.sum(forward_meta.seq_lens_this_time[decode_non_zeros_ids])
+self.mixed = True
+self.prefill_num_tokens = paddle.sum(forward_meta.seq_lens_encoder).item()
self.prefill_info_dict["cu_seqlens_q"] = paddle.zeros(
[self.prefill_len + 1], dtype=forward_meta.cu_seqlens_q.dtype
)
@@ -145,36 +117,30 @@ class IluvatarAttnBackend(AttentionBackend):
]
self.prefill_info_dict["cu_seqlens_q"] = paddle.cumsum(self.prefill_info_dict["cu_seqlens_q"])
-self.prefill_qkv = paddle.zeros([prefill_num_tokens, self.total_hidden_dim], dtype=self.dtype)
-self.decode_qkv = paddle.zeros([decode_num_tokens, self.total_hidden_dim], dtype=self.dtype)
-self.merged_output = paddle.zeros(
-    [prefill_num_tokens + decode_num_tokens, self.num_heads, self.head_dim], dtype=self.dtype
+self.tmp_buffer = paddle.zeros(
+    [self.prefill_num_tokens + self.decode_len, self.hidden_dim], dtype=self.dtype
)
-prefill_start, decode_start, start = 0, 0, 0
+prefill_start, decode_start, start = 0, self.prefill_num_tokens, 0
non_zeros_ids = forward_meta.seq_lens_this_time != 0
non_zeros_seq_lens = forward_meta.seq_lens_this_time[non_zeros_ids]
end = non_zeros_seq_lens[0]
if end > 1:
last_stage = "prefill"
prefill_end = end
-decode_end = 0
+decode_end = decode_start
else:
last_stage = "decode"
prefill_end = 0
-decode_end = end
+decode_end = decode_start + end
-self.prefill_info_dict["id_group"] = []
-self.prefill_info_dict["reverse_id_group"] = []
-self.decode_info_dict["id_group"] = []
-self.decode_info_dict["reverse_id_group"] = []
-self.record_stages = []
+self.id_group = []
+self.reverse_id_group = []
for seq_len in non_zeros_seq_lens[1:]:
if seq_len > 1:
if last_stage == "decode":
-self.record_stages.append((last_stage, len(self.decode_info_dict["id_group"])))
-self.decode_info_dict["id_group"].append((decode_start, decode_end))
-self.decode_info_dict["reverse_id_group"].append((start, end))
+self.id_group.append((decode_start, decode_end))
+self.reverse_id_group.append((start, end))
decode_start = decode_end
start = end
last_stage = "prefill"
@@ -182,9 +148,8 @@ class IluvatarAttnBackend(AttentionBackend):
end += seq_len
else:
if last_stage == "prefill":
- self.record_stages.append((last_stage, len(self.prefill_info_dict["id_group"])))
- self.prefill_info_dict["id_group"].append((prefill_start, prefill_end))
- self.prefill_info_dict["reverse_id_group"].append((start, end))
+ self.id_group.append((prefill_start, prefill_end))
+ self.reverse_id_group.append((start, end))
prefill_start = prefill_end
start = end
last_stage = "decode"
@@ -192,13 +157,11 @@ class IluvatarAttnBackend(AttentionBackend):
end += seq_len
if prefill_start < prefill_end:
- self.record_stages.append(("prefill", len(self.prefill_info_dict["id_group"])))
- self.prefill_info_dict["id_group"].append((prefill_start, prefill_end))
- self.prefill_info_dict["reverse_id_group"].append((start, end))
+ self.id_group.append((prefill_start, prefill_end))
+ self.reverse_id_group.append((start, end))
if decode_start < decode_end:
- self.record_stages.append(("decode", len(self.decode_info_dict["id_group"])))
- self.decode_info_dict["id_group"].append((decode_start, decode_end))
- self.decode_info_dict["reverse_id_group"].append((start, end))
+ self.id_group.append((decode_start, decode_end))
+ self.reverse_id_group.append((start, end))

def get_attntion_meta(self):
"""get_attntion_meta"""
@@ -214,206 +177,20 @@ class IluvatarAttnBackend(AttentionBackend):
""" """
return ( return (
max_num_blocks, max_num_blocks,
self.attention_metadata.num_kv_heads, self.num_kv_heads,
self.attention_metadata.block_size, self.block_size,
self.head_dim, self.head_dim,
) )
def prefill_update_kv_cache( def transpose(self, hidden_states):
self, k, v, k_cache_id: int, v_cache_id: int, layer_id: int, forward_meta: ForwardMeta, prefill_batch_ids: list for ids, reverse_ids in zip(self.id_group, self.reverse_id_group):
): self.tmp_buffer[ids[0] : ids[1], :] = hidden_states[reverse_ids[0] : reverse_ids[1], :]
# [num_tokens, num_kv_heads, head_dim] -> [num_kv_heads, num_tokens, head_dim] return self.tmp_buffer
trans_k = k.transpose([1, 0, 2]).contiguous()
trans_v = v.transpose([1, 0, 2]).contiguous()
tensor_start = 0
for batch_idx in prefill_batch_ids:
seq_len = forward_meta.seq_lens_this_time[batch_idx]
tensor_end = tensor_start + seq_len def reverse_transpose(self, hidden_states):
slice_trans_k = trans_k[:, tensor_start:tensor_end, :] for ids, reverse_ids in zip(self.id_group, self.reverse_id_group):
slice_trans_v = trans_v[:, tensor_start:tensor_end, :] self.tmp_buffer[reverse_ids[0] : reverse_ids[1], :] = hidden_states[ids[0] : ids[1], :]
return self.tmp_buffer
cur_block_tables = forward_meta.block_tables[batch_idx]
cur_used_block_tables = cur_block_tables[cur_block_tables != -1]
cache_start = 0
cur_used_num_blocks = cur_used_block_tables.shape[0]
for i, block_id in enumerate(cur_used_block_tables):
# last block: seq_len - cache_start <= block_size
if i == cur_used_num_blocks - 1:
cache_end = seq_len - cache_start
assert cache_end <= self.attention_metadata.block_size
paddle.assign(
slice_trans_k[:, cache_start:seq_len, :],
output=forward_meta.caches[k_cache_id][block_id, :, 0:cache_end, :],
)
paddle.assign(
slice_trans_v[:, cache_start:seq_len, :],
output=forward_meta.caches[v_cache_id][block_id, :, 0:cache_end, :],
)
if layer_id == self.num_layers - 1:
self.record_block_table_metadata[batch_idx] = {
"block_id": block_id.item(),
"cache_end": cache_end.item(),
}
# non last block: seq_lens_this_time > block_size
else:
assert seq_len > self.attention_metadata.block_size
cache_end = cache_start + self.attention_metadata.block_size
paddle.assign(
slice_trans_k[:, cache_start:cache_end, :], output=forward_meta.caches[k_cache_id][block_id]
)
paddle.assign(
slice_trans_v[:, cache_start:cache_end, :], output=forward_meta.caches[v_cache_id][block_id]
)
cache_start += self.attention_metadata.block_size
tensor_start = tensor_end
def get_splited_qkv(
self, qkv: paddle.Tensor, forward_meta: ForwardMeta, cu_seqlens_q: paddle.Tensor, batch_ids=None
):
q_end = self.hidden_dim
k_end = q_end + self.attention_metadata.num_kv_heads * self.head_dim
v_end = k_end + self.attention_metadata.num_kv_heads * self.head_dim
assert v_end == qkv.shape[-1], f"Shape mismatch: {v_end} vs {qkv.shape[-1]}"
assert qkv.shape[0] == cu_seqlens_q[-1], f"Shape mismatch: {qkv.shape[0]} vs {cu_seqlens_q[-1]}"
if batch_ids is None:
batch_ids = list(range(forward_meta.seq_lens_this_time.shape[0]))
q = qkv[..., 0:q_end]
k = qkv[..., q_end:k_end]
v = qkv[..., k_end:v_end]
q = q.view([-1, self.num_heads, self.head_dim])
k = k.view([-1, self.attention_metadata.num_kv_heads, self.head_dim])
v = v.view([-1, self.attention_metadata.num_kv_heads, self.head_dim])
for idx in range(len(cu_seqlens_q) - 1):
batch_idx = batch_ids[idx]
seq_len_i = forward_meta.seq_lens_this_time[batch_idx]
if seq_len_i == 0:
continue
cached_kv_len = forward_meta.seq_lens_decoder[batch_idx][0]
cu_seq_start_q = cu_seqlens_q[idx]
cu_seq_end_q = cu_seqlens_q[idx + 1]
# forward_meta.rotary_embs is [2, 1, S, 1, D]
if forward_meta.rotary_embs is not None:
cos = forward_meta.rotary_embs[0, 0, cached_kv_len : cached_kv_len + seq_len_i, :, :]
sin = forward_meta.rotary_embs[1, 0, cached_kv_len : cached_kv_len + seq_len_i, :, :]
q[cu_seq_start_q:cu_seq_end_q] = apply_rope(q[cu_seq_start_q:cu_seq_end_q], cos, sin)
k[cu_seq_start_q:cu_seq_end_q] = apply_rope(k[cu_seq_start_q:cu_seq_end_q], cos, sin)
return q, k, v
def split_pd_qkv(self, qkv):
for ids, reverse_ids in zip(self.prefill_info_dict["id_group"], self.prefill_info_dict["reverse_id_group"]):
self.prefill_qkv[ids[0] : ids[1], :] = qkv[reverse_ids[0] : reverse_ids[1], :]
for ids, reverse_ids in zip(self.decode_info_dict["id_group"], self.decode_info_dict["reverse_id_group"]):
self.decode_qkv[ids[0] : ids[1], :] = qkv[reverse_ids[0] : reverse_ids[1], :]
return self.prefill_qkv, self.decode_qkv
def merge_pd_output(self, prefill_out, decode_out):
for stage, idx in self.record_stages:
if stage == "prefill":
ids = self.prefill_info_dict["id_group"][idx]
reverse_ids = self.prefill_info_dict["reverse_id_group"][idx]
self.merged_output[reverse_ids[0] : reverse_ids[1], :, :] = prefill_out[ids[0] : ids[1], :, :]
else:
ids = self.decode_info_dict["id_group"][idx]
reverse_ids = self.decode_info_dict["reverse_id_group"][idx]
self.merged_output[reverse_ids[0] : reverse_ids[1], :, :] = decode_out[ids[0] : ids[1], :, :]
return self.merged_output
def forward_prefill(self, prefill_qkv, layer_id, k_cache_id, v_cache_id, forward_meta: ForwardMeta):
prefill_q, prefill_k, prefill_v = self.get_splited_qkv(
prefill_qkv,
forward_meta,
self.prefill_info_dict["cu_seqlens_q"],
batch_ids=self.prefill_info_dict["batch_ids"],
)
prefill_out = flash_attn_unpadded(
prefill_q,
prefill_k,
prefill_v,
cu_seqlens_q=self.prefill_info_dict["cu_seqlens_q"],
cu_seqlens_k=self.prefill_info_dict["cu_seqlens_q"],
max_seqlen_q=self.attention_metadata.max_context_len,
max_seqlen_k=self.attention_metadata.max_context_len,
scale=self.attention_metadata.scale,
dropout=self.attention_metadata.dropout,
causal=self.attention_metadata.causal,
return_softmax=self.attention_metadata.return_softmax,
)[0]
self.prefill_update_kv_cache(
prefill_k, prefill_v, k_cache_id, v_cache_id, layer_id, forward_meta, self.prefill_info_dict["batch_ids"]
)
return prefill_out
def forward_decode(self, decode_qkv, k_cache_id, v_cache_id, forward_meta: ForwardMeta):
k_cache = forward_meta.caches[k_cache_id]
v_cache = forward_meta.caches[v_cache_id]
if self.enable_fused_attention:
rope_cos = forward_meta.rotary_embs[0, 0, :, :, :]
rope_sin = forward_meta.rotary_embs[1, 0, :, :, :]
decode_out = paged_attention(
decode_qkv.view([-1, self.total_num_heads, self.head_dim]),
k_cache,
v_cache,
block_tables=forward_meta.block_tables[self.decode_info_dict["batch_ids"], :],
seq_lens=forward_meta.seq_lens_decoder[self.decode_info_dict["batch_ids"], 0] + 1,
num_kv_heads=self.attention_metadata.num_kv_heads,
scale=self.attention_metadata.scale,
block_size=self.attention_metadata.block_size,
max_context_len=self.attention_metadata.max_context_len,
alibi_slopes=self.attention_metadata.alibi_slopes,
causal=self.attention_metadata.causal,
window_left=self.attention_metadata.window_left,
window_right=self.attention_metadata.window_right,
softcap=self.attention_metadata.softcap,
use_cuda_graph=self.attention_metadata.use_cuda_graph,
use_sqrt_alibi=self.attention_metadata.use_sqrt_alibi,
merged_qkv=True,
k=decode_qkv,
v=decode_qkv,
rope_sin=rope_sin,
rope_cos=rope_cos,
)
else:
decode_q, decode_k, decode_v = self.get_splited_qkv(
decode_qkv,
forward_meta,
self.decode_info_dict["cu_seqlens_q"],
batch_ids=self.decode_info_dict["batch_ids"],
)
decode_out = paged_attention(
decode_q,
k_cache,
v_cache,
block_tables=forward_meta.block_tables[self.decode_info_dict["batch_ids"], :],
seq_lens=forward_meta.seq_lens_decoder[self.decode_info_dict["batch_ids"], 0] + 1,
num_kv_heads=self.attention_metadata.num_kv_heads,
scale=self.attention_metadata.scale,
block_size=self.attention_metadata.block_size,
max_context_len=self.attention_metadata.max_context_len,
alibi_slopes=self.attention_metadata.alibi_slopes,
causal=self.attention_metadata.causal,
window_left=self.attention_metadata.window_left,
window_right=self.attention_metadata.window_right,
softcap=self.attention_metadata.softcap,
use_cuda_graph=self.attention_metadata.use_cuda_graph,
use_sqrt_alibi=self.attention_metadata.use_sqrt_alibi,
k=decode_k,
v=decode_v,
)
return decode_out
def forward_mixed(
self,
@@ -429,23 +206,84 @@ class IluvatarAttnBackend(AttentionBackend):
""" """
forward_mixed forward_mixed
""" """
assert not self.use_speculate, "IluvatarAttnBackend cannot support speculate now"
layer_id = layer.layer_id layer_id = layer.layer_id
k_cache_id = layer_id * 2 k_cache_id = layer_id * 2
v_cache_id = k_cache_id + 1 v_cache_id = k_cache_id + 1
q_dim = qkv.dim() k_cache = forward_meta.caches[k_cache_id]
assert q_dim == 2 v_cache = forward_meta.caches[v_cache_id]
if self.decode_len == 0: if self.decode_len == 0:
output = self.forward_prefill(qkv, layer_id, k_cache_id, v_cache_id, forward_meta) output = prefill_fused_paged_attention(
qkv,
k_cache,
v_cache,
block_tables=forward_meta.block_tables[self.prefill_info_dict["batch_ids"], :],
cu_seqlens_qkv=self.prefill_info_dict["cu_seqlens_q"],
num_heads=self.num_heads,
head_dim=self.head_dim,
num_kv_heads=self.num_kv_heads,
block_size=self.block_size,
max_seq_len=self.max_context_len,
scale=self.scale,
causal=self.causal,
q_rope=True,
k_rope=True,
v_rope=False,
rope_sin=self.rope_sin,
rope_cos=self.rope_cos,
)
elif self.prefill_len == 0:
- output = self.forward_decode(qkv, k_cache_id, v_cache_id, forward_meta)
output = paged_attention(
qkv,
k_cache,
v_cache,
block_tables=forward_meta.block_tables[self.decode_info_dict["batch_ids"], :],
seq_lens=forward_meta.seq_lens_decoder[self.decode_info_dict["batch_ids"], 0] + 1,
num_heads=self.num_heads,
head_dim=self.head_dim,
num_kv_heads=self.num_kv_heads,
scale=self.scale,
block_size=self.block_size,
max_context_len=self.max_context_len,
alibi_slopes=self.attention_metadata.alibi_slopes,
causal=self.causal,
window_left=self.attention_metadata.window_left,
window_right=self.attention_metadata.window_right,
softcap=self.attention_metadata.softcap,
use_cuda_graph=self.attention_metadata.use_cuda_graph,
use_sqrt_alibi=self.attention_metadata.use_sqrt_alibi,
merged_qkv=True,
k=qkv,
v=qkv,
rope_sin=self.rope_sin,
rope_cos=self.rope_cos,
)
else:
- prefill_qkv, decode_qkv = self.split_pd_qkv(qkv)
- prefill_output = self.forward_prefill(prefill_qkv, layer_id, k_cache_id, v_cache_id, forward_meta)
- decode_output = self.forward_decode(decode_qkv, k_cache_id, v_cache_id, forward_meta)
- output = self.merge_pd_output(prefill_output, decode_output)
output = mixed_fused_paged_attention(
qkv,
k_cache,
v_cache,
prefill_block_tables=forward_meta.block_tables[self.prefill_info_dict["batch_ids"], :],
decode_block_tables=forward_meta.block_tables[self.decode_info_dict["batch_ids"], :],
cu_seqlens_qkv=self.prefill_info_dict["cu_seqlens_q"],
seq_lens=forward_meta.seq_lens_decoder[self.decode_info_dict["batch_ids"], 0] + 1,
prefill_num_tokens=self.prefill_num_tokens,
num_heads=self.num_heads,
head_dim=self.head_dim,
num_kv_heads=self.num_kv_heads,
block_size=self.block_size,
max_seq_len=self.max_context_len,
scale=self.scale,
causal=self.causal,
q_rope=True,
k_rope=True,
v_rope=False,
window_left=self.attention_metadata.window_left,
window_right=self.attention_metadata.window_right,
softcap=self.attention_metadata.softcap,
use_cuda_graph=self.attention_metadata.use_cuda_graph,
use_sqrt_alibi=self.attention_metadata.use_sqrt_alibi,
rope_sin=self.rope_sin,
rope_cos=self.rope_cos,
)
output = output.view([-1, self.num_heads * self.head_dim])
return output
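To make the bookkeeping above easier to follow, here is a minimal, self-contained sketch (not part of the commit) of how `id_group` / `reverse_id_group` permute a mixed batch so that prefill tokens occupy the front of the buffer and decode tokens the tail. The batch sizes are invented, NumPy stands in for the Paddle tensors, and the sketch records one span per request instead of the merged runs the backend builds, which yields the same permutation.

# Illustrative sketch only -- not code from this commit.
import numpy as np

seq_lens_this_time = [3, 1, 2, 1]        # >1 => prefill request, ==1 => decode request
prefill_num_tokens = sum(s for s in seq_lens_this_time if s > 1)   # 5

id_group, reverse_id_group = [], []
prefill_start, decode_start, start = 0, prefill_num_tokens, 0
for s in seq_lens_this_time:
    if s > 1:                            # prefill span lands in the front region
        id_group.append((prefill_start, prefill_start + s))
        prefill_start += s
    else:                                # decode token lands in the tail region
        id_group.append((decode_start, decode_start + s))
        decode_start += s
    reverse_id_group.append((start, start + s))   # original position in the batch
    start += s

tokens = np.arange(sum(seq_lens_this_time))       # stand-in for hidden_states rows
buf = np.empty_like(tokens)
for (dst0, dst1), (src0, src1) in zip(id_group, reverse_id_group):
    buf[dst0:dst1] = tokens[src0:src1]            # what transpose() does
# buf == [0 1 2 4 5 3 6]: prefill tokens first, decode tokens last

restored = np.empty_like(buf)
for (src0, src1), (dst0, dst1) in zip(id_group, reverse_id_group):
    restored[dst0:dst1] = buf[src0:src1]          # what reverse_transpose() does
assert (restored == tokens).all()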


@@ -83,7 +83,6 @@ class CutlassMoEMethod(UnquantizedFusedMoEMethod):
expert_idx_per_token,
self.moe_quant_type,
used_in_ep_low_latency,
- estimate_total_token_nums,
)
return fastdeploy.model_executor.ops.gpu.moe_expert_ffn(
permute_input,


@@ -53,6 +53,7 @@ from fastdeploy.model_executor.models.model_base import (
from fastdeploy.model_executor.models.tp_utils import TensorSplitMode as tsm
from fastdeploy.model_executor.models.utils import LayerIdPlaceholder as layerid
from fastdeploy.model_executor.models.utils import WeightMeta
+ from fastdeploy.platforms import current_platform
from fastdeploy.worker.experts_manager import RedundantExpertManger

@@ -464,6 +465,9 @@ class Ernie4_5_Model(nn.Layer):
):
hidden_states = self.embed_tokens(ids_remove_padding=ids_remove_padding)
+ if current_platform.is_iluvatar() and forward_meta.attn_backend.mixed:
+ hidden_states = forward_meta.attn_backend.transpose(hidden_states)
residual = None
for i in range(self.num_layers):
hidden_states, residual = self.layers[i](forward_meta, hidden_states, residual)

@@ -472,6 +476,9 @@ class Ernie4_5_Model(nn.Layer):
out = self.norm(hidden_states)
+ if current_platform.is_iluvatar() and forward_meta.attn_backend.mixed:
+ out = forward_meta.attn_backend.reverse_transpose(out)
return out
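For readability, here is a condensed view of the forward path with the new hook (an illustrative sketch, not the commit's code): `current_platform`, `forward_meta`, and the attention-backend methods are the names used in the diff above, while `model` and the wrapper function are hypothetical stand-ins for `Ernie4_5_Model`.

# Illustrative sketch only -- mirrors the hook added to Ernie4_5_Model.forward above.
from fastdeploy.platforms import current_platform

def forward_with_mixed_reordering(model, ids_remove_padding, forward_meta):
    hidden_states = model.embed_tokens(ids_remove_padding=ids_remove_padding)
    reorder = current_platform.is_iluvatar() and forward_meta.attn_backend.mixed
    if reorder:
        # Pack prefill tokens into [0, prefill_num_tokens) so the fused mixed
        # paged-attention kernel sees one contiguous prefill region.
        hidden_states = forward_meta.attn_backend.transpose(hidden_states)
    residual = None
    for layer in model.layers:
        hidden_states, residual = layer(forward_meta, hidden_states, residual)
    out = model.norm(hidden_states)
    if reorder:
        # Restore the original per-request token order for the rest of the stack.
        out = forward_meta.attn_backend.reverse_transpose(out)
    return out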


@@ -20,4 +20,8 @@ PACKAGE = "fastdeploy.model_executor.ops.iluvatar"
import_custom_ops(PACKAGE, ".fastdeploy_ops", globals())
from .moe_ops import iluvatar_moe_expert_ffn as moe_expert_ffn  # noqa: F401
- from .paged_attention import paged_attention  # noqa: F401
+ from .paged_attention import (  # noqa: F401
+ mixed_fused_paged_attention,
+ paged_attention,
+ prefill_fused_paged_attention,
+ )


@@ -17,9 +17,15 @@
import paddle

try:
- from fastdeploy.model_executor.ops.iluvatar import paged_attn
+ from fastdeploy.model_executor.ops.iluvatar import (
+ mixed_fused_paged_attn,
+ paged_attn,
+ prefill_fused_paged_attn,
+ )
except ImportError:
paged_attn = None
+ prefill_fused_paged_attn = None
+ mixed_fused_paged_attn = None

def paged_attention(
@@ -28,6 +34,8 @@ def paged_attention(
v_cache: paddle.Tensor,
block_tables: paddle.Tensor,
seq_lens: paddle.Tensor,
+ num_heads: int,
+ head_dim: int,
num_kv_heads: int,
scale: float,
block_size: int,
@@ -45,7 +53,7 @@ def paged_attention(
rope_sin: paddle.Tensor = None,
rope_cos: paddle.Tensor = None,
):
- output = paged_attn(
+ return paged_attn(
q,
k_cache,
v_cache,
@@ -56,6 +64,8 @@ def paged_attention(
v,
rope_sin,
rope_cos,
+ num_heads,
+ head_dim,
num_kv_heads,
scale,
block_size,
@@ -68,4 +78,99 @@ def paged_attention(
use_sqrt_alibi,
merged_qkv,
)
- return output[0] if isinstance(output, list) else output
def prefill_fused_paged_attention(
qkv: paddle.Tensor,
k_cache: paddle.Tensor,
v_cache: paddle.Tensor,
block_tables: paddle.Tensor,
cu_seqlens_qkv: paddle.Tensor,
num_heads: int,
head_dim: int,
num_kv_heads: int,
block_size: int,
max_seq_len: int,
scale: float,
causal: bool = True,
q_rope: bool = True,
k_rope: bool = True,
v_rope: bool = False,
rope_sin: paddle.Tensor = None,
rope_cos: paddle.Tensor = None,
):
return prefill_fused_paged_attn(
qkv,
k_cache,
v_cache,
block_tables,
cu_seqlens_qkv,
rope_sin,
rope_cos,
num_heads,
head_dim,
num_kv_heads,
block_size,
max_seq_len,
scale,
causal,
q_rope,
k_rope,
v_rope,
)
def mixed_fused_paged_attention(
qkv: paddle.Tensor,
k_cache: paddle.Tensor,
v_cache: paddle.Tensor,
prefill_block_tables: paddle.Tensor,
decode_block_tables: paddle.Tensor,
cu_seqlens_qkv: paddle.Tensor,
seq_lens: paddle.Tensor,
prefill_num_tokens: int,
num_heads: int,
head_dim: int,
num_kv_heads: int,
block_size: int,
max_seq_len: int,
scale: float,
causal: bool = True,
q_rope: bool = True,
k_rope: bool = True,
v_rope: bool = False,
window_left: int = -1,
window_right: int = -1,
softcap: float = 0.0,
use_cuda_graph: bool = False,
use_sqrt_alibi: bool = False,
rope_sin: paddle.Tensor = None,
rope_cos: paddle.Tensor = None,
):
return mixed_fused_paged_attn(
qkv,
k_cache,
v_cache,
prefill_block_tables,
decode_block_tables,
cu_seqlens_qkv,
seq_lens,
rope_sin,
rope_cos,
prefill_num_tokens,
num_heads,
head_dim,
num_kv_heads,
block_size,
max_seq_len,
scale,
causal,
q_rope,
k_rope,
v_rope,
window_left,
window_right,
softcap,
use_cuda_graph,
use_sqrt_alibi,
)
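As a rough guide to what these wrappers expect, the snippet below (illustrative only, with made-up lengths and flattened 1-D stand-ins for the 2-D metadata carried by `ForwardMeta`) derives `cu_seqlens_qkv`, `seq_lens`, and `prefill_num_tokens` the same way the Iluvatar backend does before calling them.

# Illustrative sketch only -- not code from this commit.
import paddle

seq_lens_this_time = paddle.to_tensor([3, 1, 2, 1])   # >1 => prefill, ==1 => decode
seq_lens_decoder = paddle.to_tensor([0, 17, 0, 42])   # tokens already cached per request

prefill_lens = seq_lens_this_time[seq_lens_this_time > 1]           # [3, 2]
# prefix sum over prefill lengths: token offsets of each prefill request in qkv
cu_seqlens_qkv = paddle.concat(
    [paddle.zeros([1], dtype=prefill_lens.dtype), paddle.cumsum(prefill_lens)]
)                                                                    # [0, 3, 5]
# kv length each decode request attends over, including the token being generated
seq_lens = seq_lens_decoder[seq_lens_this_time == 1] + 1             # [18, 43]
prefill_num_tokens = paddle.sum(prefill_lens).item()                 # 5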


@@ -13,10 +13,10 @@ python -m pip install -r requirements_iluvatar.txt
echo "uninstall org" echo "uninstall org"
python -m pip uninstall paddlepaddle -y python -m pip uninstall paddlepaddle -y
python -m pip uninstall paddle-iluvatar-gpu -y python -m pip uninstall paddle-iluvatar-gpu -y
python -m pip install --pre paddlepaddle==3.0.0.dev20250708 -i https://www.paddlepaddle.org.cn/packages/nightly/cpu/ # python -m pip install --pre paddlepaddle==3.0.0.dev20250708 -i https://www.paddlepaddle.org.cn/packages/nightly/cpu/
# TODO: Change to open access URL # python -m pip install --pre paddle-iluvatar-gpu==3.0.0.dev20250806 -i https://www.paddlepaddle.org.cn/packages/nightly/ixuca/
python -m pip install --pre paddle-iluvatar-gpu -i https://www.paddlepaddle.org.cn/packages/nightly/ixuca/ python -m pip install paddlepaddle==3.1.1 -i https://www.paddlepaddle.org.cn/packages/stable/cpu/
# python -m pip install /data1/fastdeploy/packages/paddle_iluvatar_gpu-0.0.0-cp310-cp310-linux_x86_64.whl python -m pip install paddle-iluvatar-gpu==3.1.1 -i https://www.paddlepaddle.org.cn/packages/stable/ixuca/
# Patch, remove if image updated # Patch, remove if image updated
cp /data1/fastdeploy/packages/cusolver.h /usr/local/lib/python3.10/site-packages/paddle/include/paddle/phi/backends/dynload/cusolver.h cp /data1/fastdeploy/packages/cusolver.h /usr/local/lib/python3.10/site-packages/paddle/include/paddle/phi/backends/dynload/cusolver.h
echo "build whl" echo "build whl"


@@ -1,4 +1,7 @@
from fastdeploy import LLM, SamplingParams
+ from fastdeploy.utils import set_random_seed

+ set_random_seed(123)

prompts = [
"Hello, my name is",
@@ -12,7 +15,6 @@ llm = LLM(
model="/data1/fastdeploy/ERNIE_300B_4L",
tensor_parallel_size=8,
max_model_len=8192,
- static_decode_blocks=0,
quantization="wint8",
block_size=16,
)
@@ -27,14 +29,14 @@ assert outputs[0].outputs.token_ids == [
59335,
68170,
183,
- 97404,
- 100088,
- 36310,
- 95633,
- 95913,
- 41459,
- 95049,
- 94970,
- 96840,
+ 49080,
+ 94717,
+ 82966,
+ 99140,
+ 31615,
+ 51497,
+ 94851,
+ 60764,
+ 10889,
2,
], f"{outputs[0].outputs.token_ids}"