[XPU] support XPU VL model inference (#4030)

* [XPU] support XPU VL model inference

* fix image op import and device check

* rebase develop

* fix perf
Author: Lucas
Date: 2025-09-25 14:34:15 +08:00
Committed by: GitHub
Parent: e36eccfdad
Commit: 87179cb744
18 changed files with 1300 additions and 146 deletions


@@ -41,7 +41,9 @@ std::vector<paddle::Tensor> BlockAttnKernel(
     const paddle::Tensor &encoder_seq_lod_cpu,
     const paddle::Tensor &encoder_batch_map_cpu,
     const paddle::Tensor &decoder_context_len_cpu,
-    const paddle::Tensor &decoder_batch_map_cpu) {
+    const paddle::Tensor &decoder_batch_map_cpu,
+    const std::string &pos_emb_type="NORMAL",
+    bool rope_3d=false) {
   phi::XPUPlace place(phi::backends::xpu::GetXPUCurrentDeviceId());
   auto dev_ctx =
       paddle::experimental::DeviceContextPool::Instance().Get(place);
@@ -72,6 +74,14 @@ std::vector<paddle::Tensor> BlockAttnKernel(
   int enc_batch = enc_batch_tensor.data<int32_t>()[0];
   int dec_batch = dec_batch_tensor.data<int32_t>()[0];
   int total_enc_len = total_enc_len_tensor.data<int32_t>()[0];
+  int rope_max_seqlen = 0;
+  int rope_3d_num_seqs = 1;
+  if (rope_3d) {
+    rope_max_seqlen = rotary_embs.dims()[3];
+    rope_3d_num_seqs = rotary_embs.dims()[0];
+  } else {
+    rope_max_seqlen = rotary_embs.dims()[2];
+  }
   auto block_attn_out =
       paddle::full({token_num, hidden_dim}, -1, qkv.type(), qkv.place());
@@ -151,10 +161,10 @@ std::vector<paddle::Tensor> BlockAttnKernel(
         prefix_lens_vp,   // start_tokens
         param.batch_size, // batch_size
         1,                // emb_batch_size
-        rotary_embs.dims()[2], // max_seqlen
+        rope_max_seqlen,  // max_seqlen
         param.head_num, param.kv_head_num, param.head_dim,
         param.max_batch_size, block_size, max_block_per_seq, "BLHD",
-        "HLD", "NORMAL",
+        "HLD", pos_emb_type,
         !p_kcache_perhead_scale.defined()
             ? nullptr
             : p_kcache_perhead_scale.data<float>() +
@@ -246,10 +256,10 @@ std::vector<paddle::Tensor> BlockAttnKernel(
         vsl.slot_mapping_vp, // real_batch
         param.batch_size,    // batch_size
         1,                   // emb_batch_size
-        rotary_embs.dims()[2], // max_seqlen TODO!!double check
+        rope_max_seqlen,     // max_seqlen
         param.head_num, param.kv_head_num, param.head_dim,
         param.max_batch_size, block_size, max_block_per_seq, "BLHD", "HLD",
-        "NORMAL",
+        pos_emb_type,
         !p_kcache_perhead_scale.defined()
             ? nullptr
             : p_kcache_perhead_scale.data<float>() +
@@ -260,7 +270,9 @@ std::vector<paddle::Tensor> BlockAttnKernel(
         param.kv_head_num, // v_cache_scale_inv
         nullptr,           // k_cache_zp
         nullptr,           // v_cache_zp
-        false);            // b_c8_pc
+        false,             // b_c8_pc
+        rope_3d,           // rope_3d
+        rope_3d_num_seqs);
     XFTBLOCK_CHECK_EQ(ret, api::SUCCESS);

     // attn decode
@@ -314,6 +326,7 @@ PD_BUILD_OP(block_attn)
         "decoder_context_len_cpu",
         "decoder_batch_map_cpu",
     })
+    .Attrs({"pos_emb_type:std::string", "rope_3d:bool"})
    .Outputs({"block_attn_out"})
    .SetKernelFn(PD_KERNEL(BlockAttnKernel))
    .SetInferShapeFn(PD_INFER_SHAPE(BlockAttnInferShape))
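The new rope_3d attribute only changes which axes of rotary_embs are read for the sequence length and the per-request batch. A minimal Python sketch of that selection, assuming the 3-D rope buffer layout [max_num_seqs, 2, 1, max_model_len, 1, head_dim // 2] that the VL model runner allocates later in this commit and a conventional 2-D layout with the sequence length on axis 2; the helper itself is illustrative and not part of the commit:

def rope_seqlen_info(rotary_embs_shape, rope_3d=False):
    """Mirror of the rope_max_seqlen / rope_3d_num_seqs selection in BlockAttnKernel."""
    if rope_3d:
        # assumed 3-D layout: [num_seqs, 2, 1, max_seqlen, 1, head_dim // 2]
        return rotary_embs_shape[3], rotary_embs_shape[0]
    # assumed 2-D layout: sequence length on axis 2, one shared rope table
    return rotary_embs_shape[2], 1


print(rope_seqlen_info((8, 2, 1, 8192, 1, 64), rope_3d=True))   # (8192, 8)
print(rope_seqlen_info((2, 1, 8192, 1, 128), rope_3d=False))    # (8192, 1)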


@@ -0,0 +1,60 @@
// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/extension.h"
std::vector<paddle::Tensor> GetImgBoundaries(const paddle::Tensor& task_input_ids,
const paddle::Tensor& grid_thw,
const int64_t image_patch_id) {
// All tensor in cpu
auto input_ids_ptr = task_input_ids.data<int64_t>();
int64_t seq_lens_origin = task_input_ids.numel();
auto grid_thw_ptr = grid_thw.data<int64_t>();
int token_times = 4;
int token_idx = 0;
int image_idx = 0;
std::vector<int> img_boundaries, img_nums;
img_boundaries.emplace_back(0);
img_nums.emplace_back(0);
while (token_idx < seq_lens_origin) {
if (input_ids_ptr[token_idx] != image_patch_id) {
do {
token_idx++;
} while (token_idx < seq_lens_origin && input_ids_ptr[token_idx] != image_patch_id);
} else {
int cur_image_token_len = (grid_thw_ptr[image_idx * 3 + 1] * grid_thw_ptr[image_idx * 3 + 2]) / token_times;
image_idx++;
token_idx += cur_image_token_len;
}
img_boundaries.emplace_back(token_idx);
img_nums.emplace_back(image_idx);
}
int64_t num_img_boundaries = static_cast<int64_t>(img_boundaries.size());
auto out = paddle::full({2, num_img_boundaries}, 0, paddle::DataType::INT64, paddle::CPUPlace());
for (int i = 0; i < num_img_boundaries; i++) {
out.data<int64_t>()[i] = img_boundaries[i];
out.data<int64_t>()[num_img_boundaries + i] = img_nums[i];
}
return {out};
}
PD_BUILD_OP(get_img_boundaries)
.Inputs({"task_input_ids", "grid_thw"})
.Attrs({"image_patch_id: int64_t"})
.Outputs({"img_boundaries"})
.SetKernelFn(PD_KERNEL(GetImgBoundaries));
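For intuition, the boundary scan implemented above can be mirrored in plain Python. This is an illustrative sketch only, not part of the commit; the token_times = 4 patch-merge factor and the [t, h, w] layout of grid_thw follow the C++ kernel:

def img_boundaries(input_ids, grid_thw, image_patch_id, token_times=4):
    """Return (boundaries, image_counts): prefix positions and #images seen so far."""
    boundaries, img_nums = [0], [0]
    token_idx, image_idx = 0, 0
    n = len(input_ids)
    while token_idx < n:
        if input_ids[token_idx] != image_patch_id:
            # skip over the current text span
            while token_idx < n and input_ids[token_idx] != image_patch_id:
                token_idx += 1
        else:
            # jump over one image: (h * w) patches merged by token_times
            h, w = grid_thw[image_idx][1], grid_thw[image_idx][2]
            token_idx += (h * w) // token_times
            image_idx += 1
        boundaries.append(token_idx)
        img_nums.append(image_idx)
    return boundaries, img_nums


# two text tokens, one 1x4x4 image (4 patch tokens after merging), two text tokens
ids = [101, 102, 9, 9, 9, 9, 103, 104]
print(img_boundaries(ids, [[1, 4, 4]], image_patch_id=9))
# ([0, 2, 6, 8], [0, 0, 1, 1])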


@@ -145,7 +145,8 @@ std::vector<paddle::Tensor> MoeLayerKernel(
             ? up_gate_proj_weight_scale.get_ptr()->data<float>()
             : nullptr),
         xftblock_tw,
-        std::vector<int64_t>{expert_num, inter_dim, hidden_dim});
+        std::vector<int64_t>{expert_num, inter_dim, hidden_dim}
+    );
     xdown_proj_w = std::make_shared<xftblock::Tensor>(
         const_cast<TW *>(down_proj_weight.data<TW>()), nullptr,
@@ -153,7 +154,8 @@ std::vector<paddle::Tensor> MoeLayerKernel(
             ? down_proj_weight_scale.get_ptr()->data<float>()
             : nullptr),
         xftblock_tw,
-        std::vector<int64_t>{expert_num, hidden_dim, outer_dim});
+        std::vector<int64_t>{expert_num, hidden_dim, outer_dim}
+    );
   }
   std::shared_ptr<xftblock::Tensor> xup_gate_proj_bias;
   std::shared_ptr<xftblock::Tensor> xdown_proj_bias;


@@ -0,0 +1,83 @@
// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <paddle/phi/backends/xpu/xpu_context.h>
#include <xft/xdnn_plugin.h>
#include "paddle/extension.h"
#include "xpu/plugin.h"
void TextImageGatherScatter(
paddle::Tensor& input,
paddle::Tensor& text_input,
paddle::Tensor& image_input,
paddle::Tensor& token_type_ids,
paddle::Tensor& text_index,
paddle::Tensor& image_index,
const bool is_scatter) {
phi::XPUPlace place(phi::backends::xpu::GetXPUCurrentDeviceId());
auto dev_ctx = paddle::experimental::DeviceContextPool::Instance().Get(place);
auto xpu_ctx = static_cast<const phi::XPUContext*>(dev_ctx);
const int64_t token_num = input.dims()[0];
const int64_t hidden_size = input.dims()[1];
const int64_t text_token_num = text_input.dims()[0];
const int64_t image_token_num = image_input.dims()[0];
switch (input.type()) {
case paddle::DataType::BFLOAT16: {
using XPUType = typename XPUTypeTrait<bfloat16>::Type;
typedef paddle::bfloat16 data_t;
int r = baidu::xpu::api::plugin::text_image_gather_scatter<XPUType>(
xpu_ctx->x_context(),
reinterpret_cast<XPUType*>(input.data<data_t>()),
reinterpret_cast<XPUType*>(text_input.data<data_t>()),
reinterpret_cast<XPUType*>(image_input.data<data_t>()),
reinterpret_cast<int*>(token_type_ids.data<int>()),
reinterpret_cast<int*>(text_index.data<int>()),
reinterpret_cast<int*>(image_index.data<int>()),
token_num,
text_token_num,
image_token_num,
hidden_size,
is_scatter
);
PADDLE_ENFORCE_XDNN_SUCCESS(r, "text_image_gather_scatter");
break;
}
default: {
PD_THROW(
"NOT supported data type. Only support BFLOAT16. ");
break;
}
}
}
PD_BUILD_OP(text_image_gather_scatter)
.Inputs({"input",
"text_input",
"image_input",
"token_type_ids",
"text_index",
"image_index"})
.Outputs({"text_input_out",
"image_input_out",
"text_index_out",
"image_index_out"})
.Attrs({"is_scatter:bool"})
.SetInplaceMap({{"text_input", "text_input_out"},
{"image_input", "image_input_out"},
{"text_index", "text_index_out"},
{"image_index", "image_index_out"}})
.SetKernelFn(PD_KERNEL(TextImageGatherScatter));


@@ -0,0 +1,48 @@
// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <paddle/phi/backends/xpu/xpu_context.h>
#include "paddle/extension.h"
#include "xpu/plugin.h"
void TextImageIndexOut(
const paddle::Tensor& token_type_ids,
const paddle::Tensor& text_index,
const paddle::Tensor& image_index) {
if (token_type_ids.type() != paddle::DataType::INT32 || text_index.type()
!= paddle::DataType::INT32 || image_index.type() != paddle::DataType::INT32) {
PD_THROW("NOT supported data type. Only support INT32. ");
}
phi::XPUPlace place(phi::backends::xpu::GetXPUCurrentDeviceId());
auto dev_ctx = paddle::experimental::DeviceContextPool::Instance().Get(place);
auto xpu_ctx = static_cast<const phi::XPUContext*>(dev_ctx);
const int64_t token_num = token_type_ids.shape()[0];
int r = baidu::xpu::api::plugin::text_image_index_out(xpu_ctx->x_context(),
token_type_ids.data<int32_t>(),
const_cast<int32_t*>(text_index.data<int32_t>()),
const_cast<int32_t*>(image_index.data<int32_t>()),
token_num);
PADDLE_ENFORCE_XDNN_SUCCESS(r, "text_image_index_out");
}
PD_BUILD_OP(text_image_index_out)
.Inputs({"token_type_ids",
"text_index",
"image_index"})
.Outputs({"text_index_out",
"image_index_out"})
.SetInplaceMap({{"text_index", "text_index_out"},
{"image_index", "image_index_out"}})
.SetKernelFn(PD_KERNEL(TextImageIndexOut));
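Taken together, the two ops registered above give each token a running position inside its own modality stream and then move rows between the fused sequence and the per-modality buffers. A NumPy reference of that behaviour, mirroring the CPU wrappers that appear later in this commit; it is illustrative only, not the device implementation:

import numpy as np


def index_out_ref(token_type_ids):
    """text_index[i] / image_index[i]: running position of token i inside its own stream."""
    text_index = np.zeros_like(token_type_ids)
    image_index = np.zeros_like(token_type_ids)
    text_count = image_count = 0
    for i, t in enumerate(token_type_ids):
        if t == 0:
            text_index[i] = text_count
            text_count += 1
        else:
            image_index[i] = image_count
            image_count += 1
    return text_index, image_index


def scatter_ref(input_states, token_type_ids, text_index, image_index, text_token_num, image_token_num):
    """is_scatter=True direction: copy row i of the fused input into its modality buffer."""
    hidden = input_states.shape[1]
    text_input = np.zeros((text_token_num, hidden), dtype=input_states.dtype)
    image_input = np.zeros((image_token_num, hidden), dtype=input_states.dtype)
    for i, t in enumerate(token_type_ids):
        if t == 0:
            text_input[text_index[i]] = input_states[i]
        else:
            image_input[image_index[i]] = input_states[i]
    return text_input, image_input


token_type_ids = np.array([0, 0, 1, 1, 0], dtype=np.int32)  # 0 = text, 1 = image
hidden_states = np.arange(10, dtype=np.float32).reshape(5, 2)
text_idx, image_idx = index_out_ref(token_type_ids)
print(text_idx)   # [0 1 0 0 2]
print(image_idx)  # [0 0 0 1 0]
print(scatter_ref(hidden_states, token_type_ids, text_idx, image_idx, 3, 2))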


@@ -140,6 +140,25 @@ DLL_EXPORT int quant2d_per_channel(api::Context *ctx, const TX *x,
                                   const TSCALE *scale_in, TY *y,
                                   TSCALE *scale_out, int64_t m, int64_t n);
+
+DLL_EXPORT int text_image_index_out(Context* ctx,
+                                    const int* token_type_ids, // x
+                                    int* text_index,           // y1
+                                    int* image_index,          // y2
+                                    const int64_t token_num);
+
+template <typename T>
+DLL_EXPORT int text_image_gather_scatter(api::Context* ctx,
+                                         T* input,
+                                         T* text_input,
+                                         T* image_input,
+                                         int* token_type_ids,
+                                         int* text_index,
+                                         int* image_index,
+                                         int64_t token_num,
+                                         int64_t text_token_num,
+                                         int64_t image_token_num,
+                                         int64_t hidden_size,
+                                         bool is_scatter);
+
 /*--------------------------------------- MTP being --------------------------------------------*/


@@ -0,0 +1,175 @@
#include "xpu/kernel/cluster.h"
#include "xpu/kernel/cluster_partition.h"
#include "xpu/kernel/cluster_primitive.h"
#include "xpu/kernel/xtdk_io.h"
namespace xpu3 {
namespace plugin {
template <typename T>
static __device__ inline void text_image_gather(
__global_ptr__ T* input,
__global_ptr__ T* text_input,
__global_ptr__ T* image_input,
__global_ptr__ int* token_type_ids,
__global_ptr__ int* text_index,
__global_ptr__ int* image_index,
int64_t token_num,
int64_t text_token_num,
int64_t image_token_num,
int64_t hidden_size,
T* input_lm) {
int cid = core_id();
int clusterid = cluster_id();
int token_start_cluster;
int token_end_cluster;
int token_start_core;
int token_end_core;
const int BUFSIZE = 2 * 1024 / sizeof(T); // 1024 for bf16, 512 for fp32
// cluster partition
partition(cluster_id(), cluster_num(), (int)token_num, 1, &token_start_cluster, &token_end_cluster);
if (token_start_cluster >= token_end_cluster) {
return;
}
int rows_cluster = token_end_cluster - token_start_cluster; // total rows for a cluster
// core partition
partition(core_id(), core_num(), rows_cluster, 1, &token_start_core, &token_end_core);
int rows_core = token_end_core - token_start_core; // total rows for a core
token_start_core += token_start_cluster;
token_end_core += token_start_cluster;
int read_len;
for (int i = token_start_core; i < token_end_core; i += 1) {
int token_type, text_image_token_idx;
__global_ptr__ T* text_image_input = nullptr;
__global_ptr__ int* text_image_index = nullptr;
GM2LM(token_type_ids + i, &token_type, sizeof(int));
if (token_type == 0) {
text_image_input = text_input;
text_image_index = text_index;
} else {
text_image_input = image_input;
text_image_index = image_index;
}
GM2LM(text_image_index + i, &text_image_token_idx, sizeof(int));
int input_offset = i * hidden_size;
int text_image_offset = text_image_token_idx * hidden_size;
for (int j = 0; j < hidden_size; j += BUFSIZE) {
read_len = min(hidden_size - j, BUFSIZE);
GM2LM(text_image_input + text_image_offset + j, input_lm, sizeof(T) * read_len);
LM2GM(input_lm, input + input_offset + j, sizeof(T) * read_len);
}
}
}
template <typename T>
static __device__ inline void text_image_scatter(
__global_ptr__ T* input,
__global_ptr__ T* text_input,
__global_ptr__ T* image_input,
__global_ptr__ int* token_type_ids,
__global_ptr__ int* text_index,
__global_ptr__ int* image_index,
int64_t token_num,
int64_t text_token_num,
int64_t image_token_num,
int64_t hidden_size,
T* input_lm) {
int cid = core_id();
int clusterid = cluster_id();
int token_start_cluster;
int token_end_cluster;
int token_start_core;
int token_end_core;
const int BUFSIZE = 2 * 1024 / sizeof(T); // 1024 for bf16, 512 for fp32
// cluster partition
partition(cluster_id(), cluster_num(), (int)token_num, 1, &token_start_cluster, &token_end_cluster);
if (token_start_cluster >= token_end_cluster) {
return;
}
int rows_cluster = token_end_cluster - token_start_cluster; // total rows for a cluster
// core partition
partition(core_id(), core_num(), rows_cluster, 1, &token_start_core, &token_end_core);
int rows_core = token_end_core - token_start_core; // total rows for a core
token_start_core += token_start_cluster;
token_end_core += token_start_cluster;
int read_len;
for (int i = token_start_core; i < token_end_core; i += 1) {
int token_type, text_image_token_idx;
__global_ptr__ T* text_image_input = nullptr;
__global_ptr__ int* text_image_index = nullptr;
GM2LM(token_type_ids + i, &token_type, sizeof(int));
if (token_type == 0) {
text_image_input = text_input;
text_image_index = text_index;
} else {
text_image_input = image_input;
text_image_index = image_index;
}
GM2LM(text_image_index + i, &text_image_token_idx, sizeof(int));
int input_offset = i * hidden_size;
int text_image_offset = text_image_token_idx * hidden_size;
for (int j = 0; j < hidden_size; j += BUFSIZE) {
read_len = min(hidden_size - j, BUFSIZE);
GM2LM(input + input_offset + j, input_lm, sizeof(T) * read_len);
LM2GM(input_lm, text_image_input + text_image_offset + j, sizeof(T) * read_len);
}
}
}
template <typename T>
__global__ void text_image_gather_scatter(
T* input,
T* text_input,
T* image_input,
int* token_type_ids,
int* text_index,
int* image_index,
int64_t token_num,
int64_t text_token_num,
int64_t image_token_num,
int64_t hidden_size,
bool is_scatter) {
int cid = core_id();
int ncores = core_num();
int clusterid = cluster_id();
int nclusters = cluster_num();
const int BUFSIZE = 2 * 1024 / sizeof(T); // 1024 for bf16, 512 for fp32
__simd__ T input_lm[BUFSIZE]; // 2KB for bf16 and fp32
if (is_scatter) {
text_image_scatter(
input, text_input, image_input, token_type_ids, text_index, image_index,
token_num, text_token_num, image_token_num, hidden_size, input_lm);
} else {
text_image_gather(
input, text_input, image_input, token_type_ids, text_index, image_index,
token_num, text_token_num, image_token_num, hidden_size, input_lm);
}
}
#define _XPU_DEF_TEXT_IMAGE_GATHER_SCATTER(T) \
template __global__ void text_image_gather_scatter<T>( \
T* input, \
T* text_input, \
T* image_input, \
int* token_type_ids, \
int* text_index, \
int* image_index, \
int64_t token_num, \
int64_t text_token_num, \
int64_t image_token_num, \
int64_t hidden_size, \
bool is_scatter);
_XPU_DEF_TEXT_IMAGE_GATHER_SCATTER(bfloat16);
} // namespace plugin
} // namespace xpu3


@@ -0,0 +1,97 @@
// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
/*
* copyright (C) 2025 KUNLUNXIN, Inc
*/
#include "xpu/kernel/cluster.h"
#include "xpu/kernel/cluster_partition.h"
#include "xpu/kernel/cluster_primitive.h"
#include "xpu/kernel/cluster_primitive_template.h"
namespace xpu3 {
namespace plugin {
static __device__ void do_calc(const _shared_ptr_ int* lm_x, int* lm_y1, int* lm_y2, int64_t size, int& text_count, int& images_count) {
for (int j = 0; j < size; j++) {
if (lm_x[j] == 0) {
lm_y1[j] = text_count;
text_count += 1;
} else {
lm_y2[j] = images_count;
images_count += 1;
}
}
mfence_lm_sm();
}
__global__ void text_image_index_out_kernel(
const int* token_type_ids, // x
int* text_index, // y1
int* image_index, // y2
const int64_t token_num) {
const int cid = core_id();
const int tid = core_id() * cluster_num() + cluster_id();
const int nthreads = core_num() * cluster_num();
if (tid >= 1) return;
constexpr int BUFSIZE = 1024;
constexpr int READ_MAX_SIZE = BUFSIZE / sizeof(int);
const int64_t len = token_num;
__simd__ char buffer0[BUFSIZE * 3];
__simd__ char buffer1[BUFSIZE * 3];
__simd__ __shared__ char buffer2[64][BUFSIZE * 2];
DoublePtr<READ_MAX_SIZE, SmPtr<int>> buffer_ptr_x((SmPtr<int>((_shared_ptr_ int*)buffer2[cid])));
TriplePtr<READ_MAX_SIZE, LmPtr<int>> buffer_ptr_y1((LmPtr<int>((int*)buffer0)));
TriplePtr<READ_MAX_SIZE, LmPtr<int>> buffer_ptr_y2((LmPtr<int>((int*)buffer1)));
int64_t buflen = get_1d_buflen(len, nthreads, READ_MAX_SIZE, 64);
int64_t i = tid * buflen;
int read_size = 0;
int offset = nthreads * buflen;
int text_count = 0;
int images_count = 0;
if (i < len) {
read_size = min<int64_t>(buflen, len - i);
buffer_ptr_y1.gm_load_async(text_index + tid * buflen, read_size);
buffer_ptr_y2.gm_load_async(image_index + tid * buflen, read_size);
buffer_ptr_x.gm_load_async(token_type_ids + tid * buflen, read_size);
mfence();
}
while (i < len && i + offset < len) {
i = i + offset;
int read_size_next = min<int64_t>(buflen, len - i);
buffer_ptr_x.next().gm_load_async(token_type_ids + i, read_size_next);
buffer_ptr_y1.next().gm_load_async(text_index + i, read_size_next);
buffer_ptr_y2.next().gm_load_async(image_index + i, read_size_next);
do_calc(buffer_ptr_x.ptr, buffer_ptr_y1.ptr, buffer_ptr_y2.ptr, read_size, text_count, images_count);
buffer_ptr_y1.gm_store_async(text_index + i - offset, read_size);
buffer_ptr_y2.gm_store_async(image_index + i - offset, read_size);
buffer_ptr_x.toggle();
buffer_ptr_y1.toggle();
buffer_ptr_y2.toggle();
read_size = read_size_next;
}
if (i < len) {
do_calc(buffer_ptr_x.ptr, buffer_ptr_y1.ptr, buffer_ptr_y2.ptr, read_size, text_count, images_count);
buffer_ptr_y1.gm_store_async(text_index + i, read_size);
buffer_ptr_y2.gm_store(image_index + i, read_size);
}
}
} // namespace plugin
} // namespace xpu3


@@ -0,0 +1,182 @@
// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "xpu/plugin.h"
#include "xpu/refactor/impl_public/wrapper_check.h"
namespace xpu3 {
namespace plugin {
template <typename T>
__attribute__((global)) void text_image_gather_scatter(
T* input,
T* text_input,
T* image_input,
int* token_type_ids,
int* text_index,
int* image_index,
int64_t token_num,
int64_t text_token_num,
int64_t image_token_num,
int64_t hidden_size,
bool is_scatter);
} // namespace plugin
} // namespace xpu3
namespace baidu {
namespace xpu {
namespace api {
namespace plugin {
template <typename T>
static int cpu_wrapper(
Context* ctx,
T* input, // shape [token_num, hidden_size]
T* text_input, // shape [text_token_num, hidden_size]
T* image_input, // shape [image_token_num, hidden_size]
int* token_type_ids,// shape [token_num], 0 for text, 1 for image
int* text_index, // shape [token_num], mapping from input to text_input
int* image_index, // shape [token_num], mapping from input to image_input
int64_t token_num,
int64_t text_token_num,
int64_t image_token_num,
int64_t hidden_size,
bool is_scatter) {
if (is_scatter) {
// Scatter mode: input -> text_input/image_input
for (int64_t i = 0; i < token_num; i++) {
int token_type = token_type_ids[i];
T* text_image_input = nullptr;
int* text_image_index = nullptr;
if (token_type == 0) {
text_image_input = text_input;
text_image_index = text_index;
} else { // token_type == 1
text_image_input = image_input;
text_image_index = image_index;
}
int text_image_token_idx = text_image_index[i];
int input_offset = i * hidden_size;
int text_image_offset = text_image_token_idx * hidden_size;
for (int64_t j = 0; j < hidden_size; j++) {
T value = input[input_offset + j];
text_image_input[text_image_offset + j] = value;
}
}
} else {
// Gather mode: text_input/image_input -> input
for (int64_t i = 0; i < token_num; i++) {
int token_type = token_type_ids[i];
T* text_image_input = nullptr;
int* text_image_index = nullptr;
if (token_type == 0) {
text_image_input = text_input;
text_image_index = text_index;
} else { // token_type == 1
text_image_input = image_input;
text_image_index = image_index;
}
int text_image_token_idx = text_image_index[i];
int input_offset = i * hidden_size;
int text_image_offset = text_image_token_idx * hidden_size;
for (int64_t j = 0; j < hidden_size; j++) {
T value = text_image_input[text_image_offset + j];
input[input_offset + j] = value;
}
}
}
return api::SUCCESS;
}
template <typename T>
static int xpu3_wrapper(
Context* ctx,
T* input,
T* text_input,
T* image_input,
int* token_type_ids,
int* text_index,
int* image_index,
int64_t token_num,
int64_t text_token_num,
int64_t image_token_num,
int64_t hidden_size,
bool is_scatter) {
xpu3::plugin::text_image_gather_scatter<T> <<<ctx->ncluster(), 64, ctx->xpu_stream>>>(
input, text_input, image_input, token_type_ids, text_index, image_index,
token_num, text_token_num, image_token_num, hidden_size, is_scatter
);
return api::SUCCESS;
}
template <typename T>
int text_image_gather_scatter(
Context* ctx,
T* input, // shape [token_num, hidden_size]
T* text_input, // shape [text_token_num, hidden_size]
T* image_input, // shape [image_token_num, hidden_size]
int* token_type_ids,// shape [token_num], 0 for text, 1 for image
int* text_index, // shape [token_num], mapping from input to text_input
int* image_index, // shape [token_num], mapping from input to image_input
int64_t token_num,
int64_t text_token_num,
int64_t image_token_num,
int64_t hidden_size,
bool is_scatter) {
WRAPPER_CHECK_CTX(ctx);
WRAPPER_DUMP_FUNCTION_T1(ctx, "text_image_gather_scatter", T);
WRAPPER_DUMP_PARAM6(ctx, input, text_input, image_input, token_type_ids, text_index, image_index);
WRAPPER_DUMP_PARAM5(ctx, token_num, text_token_num, image_token_num, hidden_size, is_scatter);
WRAPPER_DUMP(ctx);
WRAPPER_CHECK_PTR(ctx, T, token_num * hidden_size, input);
if (text_token_num != 0) { // avoiding text_input tensor with shape [0, hidden_size]
WRAPPER_CHECK_PTR(ctx, T, text_token_num * hidden_size, text_input);
}
if (image_token_num != 0) { // avoiding image_input tensor with shape [0, hidden_size]
WRAPPER_CHECK_PTR(ctx, T, image_token_num * hidden_size, image_input);
}
WRAPPER_CHECK_PTR(ctx, int, token_num, token_type_ids);
WRAPPER_CHECK_PTR(ctx, int, token_num, text_index);
WRAPPER_CHECK_PTR(ctx, int, token_num, image_index);
WRAPPER_ASSERT_EQ(ctx, token_num, text_token_num + image_token_num);
if (ctx->dev().type() == api::kCPU) {
return cpu_wrapper<T>(
ctx, input, text_input, image_input, token_type_ids, text_index, image_index,
token_num, text_token_num, image_token_num, hidden_size, is_scatter
);
}
if (ctx->dev().type() == api::kXPU3) {
return xpu3_wrapper<T>(
ctx, input, text_input, image_input, token_type_ids, text_index, image_index,
token_num, text_token_num, image_token_num, hidden_size, is_scatter
);
}
WRAPPER_UNIMPLEMENTED(ctx);
}
template int text_image_gather_scatter(
Context*, bfloat16*, bfloat16*, bfloat16*, int*, int*, int*, const int64_t, const int64_t, const int64_t, const int64_t, bool);
} // namespace plugin
} // namespace api
} // namespace xpu
} // namespace baidu


@@ -0,0 +1,103 @@
// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "xpu/plugin.h"
#include "xpu/refactor/impl_public/wrapper_check.h"
namespace xpu3 {
namespace plugin {
__attribute__((global)) void text_image_index_out_kernel(const int* token_type_ids, // x
int* text_index, // y1
int* image_index, // y2
const int64_t token_num);
} // namespace plugin
} // namespace xpu3
namespace baidu {
namespace xpu {
namespace api {
namespace plugin {
static int cpu_wrapper(Context* ctx,
const int* token_type_ids, // x
int* text_index, // y1
int* image_index, // y2
const int64_t token_num) {
int text_count = 0;
int image_count = 0;
for (int64_t i = 0; i < token_num; ++i) {
if (token_type_ids[i] == 0) {
text_index[i] = text_count;
++text_count;
} else {
image_index[i] = image_count;
++image_count;
}
}
return api::SUCCESS;
}
static int xpu3_wrapper(Context* ctx,
const int* token_type_ids, // x
int* text_index, // y1
int* image_index, // y2
const int64_t token_num) {
xpu3::plugin::text_image_index_out_kernel<<<1, 1, ctx->xpu_stream>>>(
token_type_ids,
text_index,
image_index,
token_num);
return api::SUCCESS;
}
int text_image_index_out(Context* ctx,
const int* token_type_ids, // x
int* text_index, // y1
int* image_index, // y2
const int64_t token_num) {
WRAPPER_CHECK_CTX(ctx);
WRAPPER_DUMP_FUNCTION_T1(ctx, "text_image_index_out", int);
WRAPPER_DUMP_PARAM4(
ctx, token_type_ids, text_index, image_index, token_num);
WRAPPER_DUMP(ctx);
WRAPPER_ASSERT_GT(ctx, token_num, 0);
WRAPPER_CHECK_PTR(ctx, int, token_num, token_type_ids);
WRAPPER_CHECK_PTR(ctx, int, token_num, text_index);
WRAPPER_CHECK_PTR(ctx, int, token_num, image_index);
if (ctx->dev().type() == api::kCPU) {
return cpu_wrapper(ctx,
token_type_ids,
text_index,
image_index,
token_num);
} else if (ctx->dev().type() == api::kXPU3) {
return xpu3_wrapper(ctx,
token_type_ids,
text_index,
image_index,
token_num);
}
WRAPPER_UNIMPLEMENTED(ctx);
}
} // namespace plugin
} // namespace api
} // namespace xpu
} // namespace baidu


@@ -30,6 +30,7 @@ import paddle
 from fastdeploy.engine.request import Request, RequestOutput, RequestStatus, RequestType
 from fastdeploy.engine.resource_manager import ResourceManager
 from fastdeploy.metrics.metrics import main_process_metrics
+from fastdeploy.platforms import current_platform
 from fastdeploy.utils import llm_logger
@@ -157,6 +158,7 @@ class ResourceManagerV1(ResourceManager):
         # TODO: set condition to new _get_num_new_tokens
         num_new_tokens = request.need_prefill_tokens - request.num_computed_tokens
         num_new_tokens = min(num_new_tokens, token_budget)
+        request.with_image = False
         if not self.config.model_config.enable_mm:
             return num_new_tokens
@@ -219,6 +221,9 @@ class ResourceManagerV1(ResourceManager):
                     grid_thw.extend([[2, one[1], one[2]]] * (one[0] // 2))
                 grid_thw = paddle.to_tensor(grid_thw, dtype="int64")
-                from fastdeploy.model_executor.ops.gpu import get_img_boundaries
+                if current_platform.is_xpu():
+                    from fastdeploy.model_executor.ops.xpu import get_img_boundaries
+                else:
+                    from fastdeploy.model_executor.ops.gpu import get_img_boundaries
                 request.multimodal_img_boundaries = get_img_boundaries(


@@ -232,6 +232,8 @@ class XPUForwardMeta(ForwardMeta):
     dec_batch: Optional[paddle.Tensor] = None
     #
     total_enc_len: Optional[paddle.Tensor] = None
+    # position embedding type in rope, supports 'NORMAL' or 'HALF_HEAD_DIM'
+    pos_emb_type: Optional[str] = "NORMAL"

 @dataclass


@@ -183,5 +183,7 @@ class XPUAttentionBackend(AttentionBackend):
             forward_meta.encoder_batch_map_cpu,
             forward_meta.decoder_context_len_cpu,
             forward_meta.decoder_batch_map_cpu,
+            forward_meta.pos_emb_type,
+            self.rope_3d,
         )
         return res


@@ -72,7 +72,7 @@ class XPUMoEMethod(UnquantizedFusedMoEMethod):
             layer.top_k,
             False,  # moe group, used in deepseek
         )
-        if layer.tp_size > 1:
+        if layer.reduce_results and layer.tp_size > 1:
             from fastdeploy.distributed.communication import (
                 tensor_model_parallel_all_reduce,
             )
@@ -252,7 +252,7 @@ class XPUWeightOnlyMoEMethod(QuantMethodBase):
             layer.top_k,
             False,  # moe group, used in deepseek
         )
-        if layer.tp_size > 1:
+        if layer.reduce_results and layer.tp_size > 1:
             from fastdeploy.distributed.communication import (
                 tensor_model_parallel_all_reduce,
             )


@@ -31,6 +31,7 @@ from paddleformers.utils.log import logger
 from fastdeploy.config import FDConfig
 from fastdeploy.distributed.communication import tensor_model_parallel_all_reduce
+from fastdeploy.model_executor.forward_meta import ForwardMeta
 from fastdeploy.model_executor.graph_optimization.decorator import (
     cuda_graph_buffers,
     support_graph_optimization,
@@ -44,20 +45,15 @@ from fastdeploy.model_executor.models.ernie4_5_moe import (
     Ernie4_5_Attention,
     Ernie4_5_MLP,
 )
+from fastdeploy.model_executor.models.ernie4_5_vl.image_op import (
+    text_image_gather_scatter,
+    text_image_index_out,
+)
 from fastdeploy.model_executor.models.model_base import (
     ModelCategory,
     ModelForCasualLM,
     ModelRegistry,
 )
-from fastdeploy.platforms import current_platform
-
-if current_platform.is_cuda():
-    from fastdeploy.model_executor.ops.gpu import (
-        text_image_gather_scatter,
-        text_image_index_out,
-    )
-from fastdeploy.model_executor.forward_meta import ForwardMeta


 class Ernie4_5_VLMLP(Ernie4_5_MLP):


@@ -0,0 +1,32 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
from fastdeploy.platforms import current_platform
if current_platform.is_cuda():
from fastdeploy.model_executor.ops.gpu import (
text_image_gather_scatter,
text_image_index_out,
)
elif current_platform.is_xpu():
from fastdeploy.model_executor.ops.xpu import (
text_image_gather_scatter,
text_image_index_out,
)
else:
raise ImportError("Unsupported platform, only support CUDA and XPU")
__all__ = ["text_image_gather_scatter", "text_image_index_out"]
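A hedged usage sketch of the round trip a VL forward pass performs with these ops, importing them through the dispatch module above. Shapes and dtypes follow the XPU kernels in this commit (BFLOAT16 hidden states, INT32 ids); the call sequence is illustrative rather than a copy of the model code and assumes the compiled ops are available on the current platform:

import paddle

from fastdeploy.model_executor.models.ernie4_5_vl.image_op import (
    text_image_gather_scatter,
    text_image_index_out,
)

token_type_ids = paddle.to_tensor([0, 1, 1, 0], dtype="int32")  # 0 = text, 1 = image
text_index = paddle.zeros_like(token_type_ids)
image_index = paddle.zeros_like(token_type_ids)
text_image_index_out(token_type_ids, text_index, image_index)  # fills both index tensors in place

hidden_size = 8
hidden_states = paddle.randn([4, hidden_size]).astype("bfloat16")
text_states = paddle.zeros([2, hidden_size], dtype="bfloat16")
image_states = paddle.zeros([2, hidden_size], dtype="bfloat16")

# is_scatter=True: split the fused sequence into per-modality buffers;
# is_scatter=False: gather the (separately processed) buffers back into hidden_states.
text_image_gather_scatter(
    hidden_states, text_states, image_states,
    token_type_ids, text_index, image_index, True,
)
# ... run the text / image branches on text_states and image_states here ...
text_image_gather_scatter(
    hidden_states, text_states, image_states,
    token_type_ids, text_index, image_index, False,
)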


@@ -25,6 +25,7 @@ from paddle import nn
 from fastdeploy import envs
 from fastdeploy.config import FDConfig
 from fastdeploy.engine.request import Request, RequestType
+from fastdeploy.input.ernie4_5_vl_processor import DataProcessor
 from fastdeploy.model_executor.forward_meta import ForwardMeta, XPUForwardMeta
 from fastdeploy.model_executor.graph_optimization.utils import (
     profile_run_guard,
@@ -34,10 +35,11 @@ from fastdeploy.model_executor.layers.attention import get_attention_backend
 from fastdeploy.model_executor.layers.attention.base_attention_backend import (
     AttentionBackend,
 )
-from fastdeploy.model_executor.layers.rotary_embedding import get_rope
+from fastdeploy.model_executor.layers.rotary_embedding import get_rope, get_rope_3d
 from fastdeploy.model_executor.layers.sample.meta_data import SamplingMetadata
 from fastdeploy.model_executor.layers.sample.sampler import Sampler
 from fastdeploy.model_executor.model_loader import get_model_loader
+from fastdeploy.model_executor.models.ernie4_5_vl.modeling_resampler import ScatterOp
 from fastdeploy.model_executor.ops.xpu import (
     adjust_batch,
     get_infer_param,
@@ -201,6 +203,45 @@ def xpu_post_process(
         update_inputs,
     )

+    # handle vl:
+    if model_output.enable_thinking:
+        exists_think_end = sampled_token_ids == model_output.think_end_id
+        paddle.assign(
+            paddle.where(
+                exists_think_end,
+                model_output.need_think_end - 1,
+                model_output.need_think_end,
+            ),
+            model_output.need_think_end,
+        )
+
+        paddle.assign(
+            paddle.where(
+                model_output.need_think_end.cast("bool"),
+                model_output.reasoning_index - 1,
+                model_output.reasoning_index,
+            ),
+            model_output.reasoning_index,
+        )
+
+        stop_wo_think = (
+            (sampled_token_ids == model_output.eos_token_id.T).any(axis=1, keepdim=True)
+            | (model_output.reasoning_index == 0)
+        ) & (model_output.need_think_end > 0)
+        sampled_token_ids = paddle.where(
+            stop_wo_think,
+            model_output.think_end_id,
+            sampled_token_ids,
+        )
+        paddle.assign(
+            paddle.where(
+                stop_wo_think,
+                model_output.need_think_end - 1,
+                model_output.need_think_end,
+            ),
+            model_output.need_think_end,
+        )
+
     # 1. Set stop value
     paddle.assign(
         paddle.where(
@@ -340,11 +381,36 @@ class XPUModelRunner(ModelRunnerBase):
     def __init__(self, fd_config: FDConfig, device: str, rank: int, local_rank: int):
         super().__init__(fd_config=fd_config, device=device)
+        self.enable_mm = self.model_config.enable_mm
         self.rank = rank
         self.local_rank = local_rank
+        self.enable_early_stop = self.fd_config.early_stop_config.enable_early_stop
+
+        # VL model config:
+        if self.enable_mm:
+            self._init_image_preprocess()
+
+            self.amp_black = [
+                "reduce_sum",
+                "c_softmax_with_cross_entropy",
+                "elementwise_div",
+                "sin",
+                "cos",
+                "sort",
+                "multinomial",
+            ]
+            self.amp_white = [
+                "lookup_table",
+                "lookup_table_v2",
+                "flash_attn",
+                "matmul",
+                "matmul_v2",
+                "fused_gemm_epilogue",
+            ]

         # Sampler
-        self.sampler = Sampler()
+        # TODU(lilujia): sync with GPU
+        self.sampler = Sampler(fd_config)

         # Lazy initialize kv cache after model loading
         # self.kv_caches: list[paddle.Tensor] = []
@@ -364,18 +430,28 @@ class XPUModelRunner(ModelRunnerBase):
         ).cpu()

         # Initialize attention Backend
-        # Note(gonshaotian): Currently, all attention layers share one attention backend instance.
+        # NOTE(gonshaotian): Currently, all attention layers share one attention backend instance.
         # In the future, we will expand it as a list.
         self.attn_backends: list[AttentionBackend] = []
         self.initialize_attn_backend()

         # Forward meta store the global meta information of the forward
         self.forward_meta: ForwardMeta = None

+    def exist_prefill(self):
+        """
+        check whether prefill stage exist
+        """
+        if int(paddle.max(self.share_inputs["seq_lens_encoder"])) != 0:
+            return 1
+        else:
+            return 0
+
     def insert_tasks_v1(self, req_dicts: List[Request]):
         """
         Process scheduler output tasks, used when ENABLE_V1_KVCACHE_SCHEDULER=1
+        req_dict: A list of Request dict
+        num_running_requests: batch_size
         """
         # NOTE(luotingdan): Lazy initialize kv cache
         if "caches" not in self.share_inputs:
@@ -388,10 +464,53 @@ class XPUModelRunner(ModelRunnerBase):
             request = req_dicts[i]
             idx = request.idx
             if request.task_type.value == RequestType.PREFILL.value:  # prefill task
+                logger.debug(f"Handle prefill request {request} at idx {idx}")
                 prefill_start_index = request.prefill_start_index
                 prefill_end_index = request.prefill_end_index
                 length = prefill_end_index - prefill_start_index
+                if self.enable_mm:
+                    inputs = request.multimodal_inputs
+                    if request.with_image:
+                        vision_inputs = {}
+                        vision_inputs["input_ids"] = paddle.to_tensor(
+                            inputs["input_ids"][prefill_start_index:prefill_end_index], dtype=paddle.int64
+                        )
+                        vision_inputs["token_type_ids"] = paddle.to_tensor(
+                            inputs["token_type_ids"][prefill_start_index:prefill_end_index], dtype=paddle.int64
+                        )
+                        vision_inputs["image_type_ids"] = paddle.to_tensor(
+                            inputs["image_type_ids"][request.image_type_ids_start : request.image_type_ids_end],
+                            dtype=paddle.int64,
+                        )
+                        vision_inputs["images"] = paddle.to_tensor(
+                            inputs["images"][request.image_start : request.image_end], dtype="uint8"
+                        )
+                        vision_inputs["grid_thw"] = paddle.to_tensor(
+                            inputs["grid_thw"][request.num_image_start : request.num_image_end], dtype="int64"
+                        )
+                        self.share_inputs["image_features"] = self.extract_vision_features(vision_inputs)
+                    else:
+                        self.share_inputs["image_features"] = None
+                    if inputs["position_ids"] is not None:
+                        position_ids = paddle.to_tensor(
+                            request.multimodal_inputs["position_ids"],
+                            dtype="int64",
+                        ).unsqueeze([0])
+                    else:
+                        position_ids = None
+                    enable_thinking = request.get("enable_thinking", True)
+                    enable_thinking = enable_thinking if enable_thinking is not None else True
+                    self.share_inputs["enable_thinking"][:] = enable_thinking
+                    self.share_inputs["need_think_end"][idx : idx + 1, :] = 1 if enable_thinking else 0
+                    self.share_inputs["reasoning_index"][idx : idx + 1, :] = request.get("reasoning_max_tokens", 2048)
+                    self.share_inputs["rope_emb"][idx : idx + 1, :] = self.prepare_rope3d(
+                        position_ids, request.get("max_tokens", 2048)
+                    )
-                input_ids = request.prompt_token_ids + request.output_token_ids
+                if len(request.output_token_ids) == 0:
+                    input_ids = request.prompt_token_ids
+                else:
+                    input_ids = request.prompt_token_ids + request.output_token_ids
                 logger.debug(
                     f"Handle prefill request {request} at idx {idx} prefill_start_index {prefill_start_index} prefill_end_index {prefill_end_index} need_prefilled_token_num {len(input_ids)}"
@@ -475,41 +594,86 @@ class XPUModelRunner(ModelRunnerBase):
             if request.get("stop_token_ids") is not None and request.get("stop_seqs_len") is not None:
                 stop_seqs_num = len(request.get("stop_seqs_len"))
                 for i in range(stop_seqs_num, self.model_config.max_stop_seqs_num):
-                    request.stop_seqs_len.append(0)
+                    request.sampling_params.stop_seqs_len.append(0)
-                self.share_inputs["stop_seqs_len"][:] = np.array(request.stop_seqs_len, dtype="int32")
-                self.share_inputs["stop_seqs"][:stop_seqs_num, : len(request.get("stop_token_ids")[0])] = np.array(
-                    request.get("stop_token_ids"), dtype="int64"
-                )
+                self.share_inputs["stop_seqs_len"][idx : idx + 1, :] = np.array(
+                    request.sampling_params.stop_seqs_len, dtype="int32"
+                )
+                self.share_inputs["stop_seqs"][
+                    idx : idx + 1, :stop_seqs_num, : len(request.get("stop_token_ids")[0])
+                ] = np.array(request.get("stop_token_ids"), dtype="int64")
+            else:
+                self.share_inputs["stop_seqs_len"][idx : idx + 1, :] = 0

         if has_prefill_task or has_decode_task:
             self.share_inputs["not_need_stop"][0] = True

-    def process_prefill_inputs(self, req_dicts: List[Request]):
+    def insert_prefill_inputs(self, req_dicts: List[Request]):
         """Process inputs for prefill tasks and update share_inputs buffer"""
         req_len = len(req_dicts)
         for i in range(req_len):
             request = req_dicts[i]
             idx = request.idx
-            length = request.prompt_token_ids_len
+            length = len(request.prompt_token_ids)
+            assert length > 0, "The prompt requested must not be empty."
+
+            self.share_inputs["pre_ids"][idx : idx + 1] = -1
+            self.share_inputs["step_idx"][idx : idx + 1] = 0
             self.share_inputs["input_ids"][idx : idx + 1, :length] = np.array(request.prompt_token_ids)
+            self.share_inputs["prompt_ids"][idx : idx + 1, :length] = np.array(request.prompt_token_ids)
+            if self.enable_mm:
+                inputs = self._preprocess_mm_task(request.multimodal_inputs)
+                if inputs.get("images") is not None:
+                    self.share_inputs["image_features"] = self.extract_vision_features(inputs)
+                else:
+                    # Compatible with the situation that lacks images and videos
+                    self.share_inputs["image_features"] = None
+                position_ids = inputs["position_ids"]
+                length = inputs["input_ids"].shape[1]
+                self.share_inputs["input_ids"][idx : idx + 1, :length] = inputs["input_ids"]
+            else:
+                self.share_inputs["seq_lens_decoder"][idx : idx + 1] = request.get("seq_lens_decoder", 0)
+                self.share_inputs["step_seq_lens_decoder"][idx : idx + 1] = request.get("seq_lens_decoder", 0)
+            self.share_inputs["seq_lens_this_time"][idx : idx + 1] = length
+            self.share_inputs["step_seq_lens_encoder"][idx : idx + 1] = length
+            self.share_inputs["seq_lens_encoder"][idx : idx + 1] = length
+            self.share_inputs["prompt_lens"][idx : idx + 1] = length
+            if self.enable_mm:
+                enable_thinking = request.get("enable_thinking", True)
+                enable_thinking = enable_thinking if enable_thinking is not None else True
+                self.share_inputs["enable_thinking"][:] = enable_thinking
+                self.share_inputs["need_think_end"][idx : idx + 1, :] = 1 if enable_thinking else 0
+                self.share_inputs["reasoning_index"][idx : idx + 1, :] = request.get("reasoning_max_tokens", 2048)
+                self.share_inputs["rope_emb"][idx : idx + 1, :] = self.prepare_rope3d(
+                    position_ids, request.get("max_tokens", 2048)
+                )
+                self.share_inputs["seq_lens_decoder"][idx : idx + 1] = 0
+
+            def get_attr_from_request(request, attr, default_value=None):
+                res = request.get(attr, default_value)
+                if res is not None:
+                    return res
+                else:
+                    return default_value

             assert len(request.eos_token_ids) == self.model_config.eos_tokens_lens
             self.share_inputs["eos_token_id"][:] = np.array(request.eos_token_ids, dtype="int64").reshape(-1, 1)
-            self.share_inputs["pre_ids"][idx : idx + 1] = -1
-            self.share_inputs["top_p"][idx : idx + 1] = request.get("top_p", 0.7)
+            self.share_inputs["top_p"][idx : idx + 1] = get_attr_from_request(request, "top_p", 0.7)
             self.share_inputs["top_k"][idx : idx + 1] = request.get("top_k", 0)
             self.share_inputs["top_k_list"][idx] = request.get("top_k", 0)
             self.share_inputs["min_p"][idx : idx + 1] = request.get("min_p", 0.0)
             self.share_inputs["min_p_list"][idx] = request.get("min_p", 0.0)
-            self.share_inputs["temperature"][idx : idx + 1] = request.get("temperature", 0.95)
-            self.share_inputs["penalty_score"][idx : idx + 1] = request.get("repetition_penalty", 1.0)
-            self.share_inputs["frequency_score"][idx : idx + 1] = request.get("frequency_penalty", 0.0)
-            self.share_inputs["presence_score"][idx : idx + 1] = request.get("presence_penalty", 0.0)
-            self.share_inputs["seq_lens_this_time"][idx : idx + 1] = length
-            self.share_inputs["step_seq_lens_encoder"][idx : idx + 1] = length
-            self.share_inputs["seq_lens_encoder"][idx : idx + 1] = length
-            self.share_inputs["seq_lens_decoder"][idx : idx + 1] = 0
-            self.share_inputs["step_idx"][idx : idx + 1] = 0
-            self.share_inputs["min_dec_len"][idx : idx + 1] = request.get("min_tokens", 1)
+            self.share_inputs["temperature"][idx : idx + 1] = get_attr_from_request(request, "temperature", 0.95)
+            self.share_inputs["penalty_score"][idx : idx + 1] = get_attr_from_request(
+                request, "repetition_penalty", 1.0
+            )
+            self.share_inputs["frequency_score"][idx : idx + 1] = get_attr_from_request(
+                request, "frequency_penalty", 0.0
+            )
+            self.share_inputs["presence_score"][idx : idx + 1] = get_attr_from_request(
+                request, "presence_penalty", 0.0
+            )
+            self.share_inputs["min_dec_len"][idx : idx + 1] = request.get("min_tokens", 1)
             self.share_inputs["max_dec_len"][idx : idx + 1] = request.get(
                 "max_tokens", self.model_config.max_model_len
             )
@@ -540,11 +704,15 @@ class XPUModelRunner(ModelRunnerBase):
             if request.get("stop_token_ids") is not None and request.get("stop_seqs_len") is not None:
                 stop_seqs_num = len(request.get("stop_seqs_len"))
                 for i in range(stop_seqs_num, self.model_config.max_stop_seqs_num):
-                    request.stop_seqs_len.append(0)
+                    request.sampling_params.stop_seqs_len.append(0)
-                self.share_inputs["stop_seqs_len"][:] = np.array(request.stop_seqs_len, dtype="int32")
-                self.share_inputs["stop_seqs"][:stop_seqs_num, : len(request.get("stop_token_ids")[0])] = np.array(
-                    request.get("stop_token_ids"), dtype="int64"
-                )
+                self.share_inputs["stop_seqs_len"][idx : idx + 1, :] = np.array(
+                    request.sampling_params.stop_seqs_len, dtype="int32"
+                )
+                self.share_inputs["stop_seqs"][
+                    idx : idx + 1, :stop_seqs_num, : len(request.get("stop_token_ids")[0])
+                ] = np.array(request.get("stop_token_ids"), dtype="int64")
+            else:
+                self.share_inputs["stop_seqs_len"][idx : idx + 1, :] = 0

         self.share_inputs["not_need_stop"][0] = True
@@ -565,6 +733,11 @@ class XPUModelRunner(ModelRunnerBase):
             self.model_config.pad_token_id,
             dtype="int64",
         )
+        self.share_inputs["prompt_ids"] = paddle.full(
+            [max_num_seqs, self.parallel_config.max_model_len],
+            self.model_config.pad_token_id,
+            dtype="int64",
+        )
         self.share_inputs["eos_token_id"] = paddle.full([self.model_config.eos_tokens_lens, 1], 0, dtype="int64")
         self.share_inputs["top_p"] = paddle.full([max_num_seqs, 1], self.model_config.top_p, dtype="float32")
         self.share_inputs["top_k"] = paddle.full([max_num_seqs, 1], 0, dtype="int64")
@@ -627,7 +800,9 @@ class XPUModelRunner(ModelRunnerBase):
         # Initialize rotary position embedding
         tmp_position_ids = paddle.arange(self.parallel_config.max_model_len).reshape((1, -1))
         # TODO(gongshaotian): move to models
+        if not self.enable_mm:
             self.share_inputs["rope_emb"] = get_rope(
                 rotary_dim=self.model_config.head_dim,
                 position_ids=tmp_position_ids,
@@ -654,18 +829,40 @@ class XPUModelRunner(ModelRunnerBase):
         self.share_inputs["free_list_len"] = paddle.full([1], self.free_list_len, dtype="int32")

         # Initialize stop seqs
-        self.share_inputs["stop_seqs_len"] = paddle.full([self.model_config.max_stop_seqs_num], 0, dtype="int32")
+        self.share_inputs["stop_seqs_len"] = paddle.full(
+            [max_num_seqs, self.model_config.max_stop_seqs_num], 0, dtype="int32"
+        )
         self.share_inputs["stop_seqs"] = paddle.full(
             [
+                max_num_seqs,
                 self.model_config.max_stop_seqs_num,
                 self.model_config.stop_seqs_max_len,
             ],
             -1,
-            dtype="int32",
+            dtype="int64",
         )
+        if self.enable_mm:
+            head_dim = self.model_config.head_dim
+            self.share_inputs["rope_emb"] = paddle.full(
+                shape=[
+                    max_num_seqs,
+                    2,
+                    1,
+                    self.parallel_config.max_model_len,
+                    1,
+                    head_dim // 2,
+                ],
+                fill_value=0,
+                dtype="float32",
+            )
+            self.share_inputs["image_features"] = None
+            self.share_inputs["need_think_end"] = paddle.full(shape=[max_num_seqs, 1], fill_value=0, dtype="int32")
+            self.share_inputs["enable_thinking"] = paddle.full(shape=[1], fill_value=True, dtype="bool")
+            self.share_inputs["reasoning_index"] = paddle.full(shape=[max_num_seqs, 1], fill_value=0, dtype="int32")

     def _prepare_inputs(self, is_dummy_run=False) -> None:
-        """prepare the model inputs"""
+        """Prepare the model inputs"""
         if envs.ENABLE_V1_KVCACHE_SCHEDULER and not is_dummy_run:
             recover_decode_task(
                 self.share_inputs["stop_flags"],
@@ -689,10 +886,13 @@ class XPUModelRunner(ModelRunnerBase):
         # Update bad tokens len
         max_bad_tokens_len = paddle.max(self.share_inputs["bad_tokens_len"])

+        if self.enable_mm:  # pos_emb_type is different in EB and VL
+            self.forward_meta.pos_emb_type = "HALF_HEAD_DIM"
         self.forward_meta.attn_backend = self.attn_backends[0]
         self.initialize_attention_backend()

         # Get sampling metadata
+        # TODU(lilujia): sync with GPU
         self.sampling_metadata = SamplingMetadata(
             temperature=self.share_inputs["temperature"],
             top_p=self.share_inputs["top_p"],
@@ -703,12 +903,16 @@ class XPUModelRunner(ModelRunnerBase):
             seed=self.share_inputs["infer_seed"],
             step_idx=self.share_inputs["step_idx"],
             pre_token_ids=self.share_inputs["pre_ids"],
+            prompt_ids=self.share_inputs["prompt_ids"],
+            prompt_lens=self.share_inputs["prompt_lens"],
             frequency_penalties=self.share_inputs["frequency_score"],
             presence_penalties=self.share_inputs["presence_score"],
             repetition_penalties=self.share_inputs["penalty_score"],
             min_dec_lens=self.share_inputs["min_dec_len"],
             bad_words_token_ids=self.share_inputs["bad_tokens"][:, :max_bad_tokens_len],
             eos_token_ids=self.share_inputs["eos_token_id"],
+            enable_early_stop=self.enable_early_stop,
+            stop_flags=self.share_inputs["stop_flags"],
         )

     def load_model(self) -> None:
@@ -723,7 +927,7 @@ class XPUModelRunner(ModelRunnerBase):
# 3. Load drafter model(for speculative decoding) # 3. Load drafter model(for speculative decoding)
def get_model(self) -> nn.Layer: def get_model(self) -> nn.Layer:
"""get current model""" """Get current model"""
return self.model return self.model
def initialize_attention_backend(self): def initialize_attention_backend(self):
@@ -741,6 +945,7 @@ class XPUModelRunner(ModelRunnerBase):
cache_kvs = {} cache_kvs = {}
max_block_num = self.num_gpu_blocks max_block_num = self.num_gpu_blocks
# Get kv cache dtype
cache_type = self.parallel_config.dtype cache_type = self.parallel_config.dtype
kv_cache_quant_type = None kv_cache_quant_type = None
@@ -800,33 +1005,6 @@ class XPUModelRunner(ModelRunnerBase):
) )
self.attn_backends.append(attn_backend) self.attn_backends.append(attn_backend)
def capture_model(self) -> None:
"""
Trigger CUDA Graph capture for all shapes in 'CudaGraphConfig.cudagraph_capture_sizes'
"""
logger.warn("XPU not support cuda graph currently")
pass
@sot_warmup_guard(True)
def sot_warmup(self) -> None:
start_time = time.perf_counter()
for batch_size in self.sot_warmup_sizes:
self._dummy_run(
num_tokens=self.scheduler_config.max_num_batched_tokens,
batch_size=batch_size,
)
logger.info(f"SOT warmup the model with the batch size:{batch_size}")
logger.info(f"SOT warmup took {time.perf_counter() - start_time} seconds")
def exist_prefill(self):
"""
check whether prefill stage exist
"""
if int(paddle.max(self.share_inputs["seq_lens_encoder"])) != 0:
return 1
else:
return 0
def _dummy_prefill_inputs(self, num_tokens: int, batch_size: int): def _dummy_prefill_inputs(self, num_tokens: int, batch_size: int):
"""Set dummy prefill inputs to share_inputs""" """Set dummy prefill inputs to share_inputs"""
full_length = min(num_tokens // batch_size, self.parallel_config.max_model_len - 10) full_length = min(num_tokens // batch_size, self.parallel_config.max_model_len - 10)
@@ -838,7 +1016,7 @@ class XPUModelRunner(ModelRunnerBase):
for i in range(batch_size): for i in range(batch_size):
idx = i idx = i
self.share_inputs["input_ids"][idx : idx + 1, :input_length] = np.array([5] * input_length) self.share_inputs["input_ids"][idx : idx + 1, :input_length] = np.array([5] * input_length)
self.share_inputs["prompt_ids"][idx : idx + 1, :input_length] = np.array([5] * input_length)
self.share_inputs["eos_token_id"][:] = np.array([2], dtype="int64").reshape(-1, 1) self.share_inputs["eos_token_id"][:] = np.array([2], dtype="int64").reshape(-1, 1)
self.share_inputs["seq_lens_this_time"][idx : idx + 1] = input_length self.share_inputs["seq_lens_this_time"][idx : idx + 1] = input_length
@@ -897,6 +1075,24 @@ class XPUModelRunner(ModelRunnerBase):
else: else:
paddle.device.xpu.set_debug_level(debug_level) paddle.device.xpu.set_debug_level(debug_level)
def capture_model(self) -> None:
"""
Trigger CUDA Graph capture for all shapes in 'CudaGraphConfig.cudagraph_capture_sizes'
"""
logger.warn("XPU not support cuda graph currently")
pass
@sot_warmup_guard(True)
def sot_warmup(self) -> None:
start_time = time.perf_counter()
for batch_size in self.sot_warmup_sizes:
self._dummy_run(
num_tokens=self.parallel_config.max_num_batched_tokens,
batch_size=batch_size,
)
logger.info(f"SOT warmup the model with the batch size:{batch_size}")
logger.info(f"SOT warmup took {time.perf_counter() - start_time} seconds")
def execute_model( def execute_model(
self, self,
model_forward_batch: Optional[List[Request]] = None, model_forward_batch: Optional[List[Request]] = None,
@@ -921,13 +1117,20 @@ class XPUModelRunner(ModelRunnerBase):
# 2. Padding inputs for cuda graph # 2. Padding inputs for cuda graph
# 3. Execute model # 3. Execute model
model_output = self.model(self.share_inputs["ids_remove_padding"], self.forward_meta) if self.enable_mm:
model_output = self.model(
self.share_inputs["ids_remove_padding"], self.share_inputs["image_features"], self.forward_meta
)
else:
model_output = self.model(
ids_remove_padding=self.share_inputs["ids_remove_padding"],
forward_meta=self.forward_meta,
)
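# VL models additionally consume the precomputed image features; the text-only
# path keeps the original keyword-argument call.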
hiddden_states = xpu_process_output(model_output, self.share_inputs["cum_offsets"], self.forward_meta) hidden_states = xpu_process_output(model_output, self.share_inputs["cum_offsets"], self.forward_meta)
# 4. Compute logits, Sample # 4. Compute logits, Sample
logits = self.model.compute_logits(hiddden_states) logits = self.model.compute_logits(hidden_states)
sampler_output = self.sampler(logits, self.sampling_metadata) sampler_output = self.sampler(logits, self.sampling_metadata)
# 5. Speculative decode # 5. Speculative decode
@@ -947,15 +1150,21 @@ class XPUModelRunner(ModelRunnerBase):
seq_lens_encoder=self.share_inputs["seq_lens_encoder"], seq_lens_encoder=self.share_inputs["seq_lens_encoder"],
seq_lens_decoder=self.share_inputs["seq_lens_decoder"], seq_lens_decoder=self.share_inputs["seq_lens_decoder"],
is_block_step=self.share_inputs["is_block_step"], is_block_step=self.share_inputs["is_block_step"],
# speculative decoding
full_hidden_states=None,
msg_queue_id=self.parallel_config.msg_queue_id, msg_queue_id=self.parallel_config.msg_queue_id,
mp_rank=self.local_rank, mp_rank=self.local_rank,
use_ep=self.parallel_config.use_ep, use_ep=self.parallel_config.use_ep,
# speculative decoding
full_hidden_states=None,
draft_tokens=None, draft_tokens=None,
actual_draft_token_num=None, actual_draft_token_num=None,
accept_tokens=None, accept_tokens=None,
accept_num=None, accept_num=None,
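# Reasoning-mode ("thinking") bookkeeping and stop-sequence tensors; only the VL
# path (enable_mm) populates the thinking-related fields, text-only runs pass None / -1.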
enable_thinking=(self.share_inputs["enable_thinking"] if self.enable_mm else None),
think_end_id=(self.model_config.think_end_id if self.enable_mm else -1),
need_think_end=(self.share_inputs["need_think_end"][:num_running_requests] if self.enable_mm else None),
reasoning_index=(self.share_inputs["reasoning_index"][:num_running_requests] if self.enable_mm else None),
stop_token_ids=self.share_inputs["stop_seqs"],
stop_seqs_len=self.share_inputs["stop_seqs_len"],
) )
xpu_post_process( xpu_post_process(
sampled_token_ids=sampler_output.sampled_token_ids, sampled_token_ids=sampler_output.sampled_token_ids,
@@ -984,13 +1193,43 @@ class XPUModelRunner(ModelRunnerBase):
@profile_run_guard(True) @profile_run_guard(True)
def profile_run(self) -> None: def profile_run(self) -> None:
"""Execute a forward pass with dummy inputs to profile the memory usage of the model.""" """Execute a forward pass with dummy inputs to profile the memory usage of the model"""
self.num_gpu_blocks = self.parallel_config.total_block_num
self.initialize_kv_cache()
self._dummy_run( self._dummy_run(
num_tokens=int(self.scheduler_config.max_num_batched_tokens), num_tokens=int(self.scheduler_config.max_num_batched_tokens),
batch_size=min(self.scheduler_config.max_num_seqs, 1), batch_size=min(self.scheduler_config.max_num_seqs, 1),
) )
def update_share_input_block_num(self, num_gpu_blocks: int) -> None:
"""
Set a globally unified block number and update the model's shared input.
Args:
num_gpu_blocks:
"""
self.num_gpu_blocks = num_gpu_blocks
# Reset block table and kv cache with global block num
self.initialize_kv_cache()
# Reset free list
free_list = list(
range(
self.num_gpu_blocks - 1,
int(self.num_gpu_blocks * self.cache_config.kv_cache_ratio) - 1,
-1,
)
)
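# Illustrative numbers (hypothetical): num_gpu_blocks=1000, kv_cache_ratio=0.75
# -> free_list = [999, 998, ..., 750] (250 block ids, the top 25%), free_list_len = 250.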
self.free_list_len = len(free_list)
self.share_inputs.update(
{
"free_list": paddle.to_tensor(free_list, dtype="int32"),
"free_list_len": paddle.full([1], self.free_list_len, dtype="int32"),
}
)
def clear_block_table(self) -> None: def clear_block_table(self) -> None:
""" """
Clear the block tables and kv cache after profiling. Clear the block tables and kv cache after profiling.
@@ -1025,41 +1264,135 @@ class XPUModelRunner(ModelRunnerBase):
byte_of_dtype = 2 byte_of_dtype = 2
hidden_dim = self.model_config.head_dim * self.model_config.kv_num_heads hidden_dim = self.model_config.head_dim * self.model_config.kv_num_heads
required_memory = ( num_layers = self.model_config.num_hidden_layers
byte_of_dtype required_memory = byte_of_dtype * 2 * (self.cache_config.block_size * hidden_dim) * num_layers # k + v
* 2 # k + v
* (self.cache_config.block_size * hidden_dim)
* self.model_config.num_hidden_layers
)
return required_memory return required_memory
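# Hedged back-of-the-envelope example (purely illustrative values):
#   byte_of_dtype=2 (fp16/bf16), block_size=64, head_dim=128, kv_num_heads=8, num_hidden_layers=48
#   hidden_dim = 128 * 8 = 1024
#   required_memory = 2 * 2 * (64 * 1024) * 48 = 12,582,912 bytes = 12 MiB per block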
def update_share_input_block_num(self, num_gpu_blocks: int) -> None:
"""
Set a globally unified block number and update the model's shared input.
Args:
num_gpu_blocks:
"""
self.num_gpu_blocks = num_gpu_blocks
# Reset block table and kv cache with global block num
self.initialize_kv_cache()
# Reset free list
free_list = list(
range(
self.num_gpu_blocks - 1,
int(self.num_gpu_blocks * self.cache_config.kv_cache_ratio) - 1,
-1,
)
)
self.free_list_len = len(free_list)
self.share_inputs.update(
{
"free_list": paddle.to_tensor(free_list, dtype="int32"),
"free_list_len": paddle.full([1], self.free_list_len, dtype="int32"),
}
)
def not_need_stop(self) -> bool: def not_need_stop(self) -> bool:
""" """ """Stop decoding if the tensor meets the termination condition"""
return self.share_inputs["not_need_stop"][0] return self.share_inputs["not_need_stop"][0]
def clear_cache(self):
"""Clear cached data from shared inputs and forward metadata"""
self.share_inputs.pop("caches", None)
if self.forward_meta is not None:
self.forward_meta.clear_caches()
def _init_image_preprocess(self) -> None:
processor = DataProcessor(
tokenizer_name=self.model_config.model,
image_preprocessor_name=str(self.model_config.model),
)
processor.eval()
image_preprocess = processor.image_preprocessor
image_preprocess.image_mean_tensor = paddle.to_tensor(image_preprocess.image_mean, dtype="float32").reshape(
[1, 3, 1, 1]
)
image_preprocess.image_std_tensor = paddle.to_tensor(image_preprocess.image_std, dtype="float32").reshape(
[1, 3, 1, 1]
)
image_preprocess.rescale_factor = paddle.to_tensor(image_preprocess.rescale_factor, dtype="float32")
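# Presumably the pixel inputs arrive already flattened into patch vectors
# (patch_size**2 values per channel), so the per-channel mean/std are expanded
# to the same length before normalization in extract_vision_features.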
image_preprocess.image_mean_tensor = image_preprocess.image_mean_tensor.squeeze([-2, -1]).repeat_interleave(
self.model_config.vision_config.patch_size**2 * 1, -1
)
image_preprocess.image_std_tensor = image_preprocess.image_std_tensor.squeeze([-2, -1]).repeat_interleave(
self.model_config.vision_config.patch_size**2 * 1, -1
)
self.image_preprocess = image_preprocess
def _preprocess_mm_task(self, one: dict) -> dict:
"""Convert one multimodal request dict (input_ids, images, grid_thw, ...) into paddle tensors"""
input_ids = one["input_ids"][np.newaxis, :]
input_ids = paddle.to_tensor(input_ids, dtype=paddle.int64)
token_type_ids = one["token_type_ids"][np.newaxis, :]
token_type_ids = paddle.to_tensor(token_type_ids, dtype=paddle.int64)
if one["images"] is not None:
image_type_ids = one["image_type_ids"][np.newaxis, :]
images = one["images"]
image_type_ids = paddle.to_tensor(image_type_ids, dtype=paddle.int64)
images = paddle.to_tensor(images, dtype="uint8")
grid_thw = paddle.to_tensor(one["grid_thw"], dtype="int64")
else:
image_type_ids = None
images = None
grid_thw = None
if one["position_ids"] is not None:
position_ids = paddle.to_tensor(one["position_ids"], dtype="int64").unsqueeze([0])
else:
position_ids = None
result = dict(
input_ids=input_ids,
image_type_ids=image_type_ids,
token_type_ids=token_type_ids,
position_ids=position_ids,
grid_thw=grid_thw,
images=images,
)
return result
@paddle.no_grad()
def extract_vision_features(self, inputs: dict) -> paddle.Tensor:
"""Run the vision encoder and resampler to produce image features for the input patches"""
assert inputs["images"] is not None
grid_thw = inputs["grid_thw"]
images = inputs["images"].cast("float32")
images = self.image_preprocess.rescale_factor * images - self.image_preprocess.image_mean_tensor
images = images / self.image_preprocess.image_std_tensor
images = images.cast("bfloat16")
token_type_ids = inputs["token_type_ids"]
token_type_ids_w_video = token_type_ids
input_ids = inputs["input_ids"]
# convert to img patch id
# TODO(lulinjun): may need to check model_config and model_cfg
image_mask = input_ids == self.model_config.im_patch_id
image_type_ids = inputs["image_type_ids"]
with paddle.amp.auto_cast(
True,
custom_black_list=self.amp_black,
custom_white_list=self.amp_white,
level="O2",
dtype=self.parallel_config.dtype,
):
image_features = self.model.vision_model.extract_feature(images, grid_thw)
if self.parallel_config.tensor_parallel_size > 1:
S, C = image_features.shape
image_features = image_features.reshape([-1, C * self.model_config.spatial_conv_size**2])
image_features = ScatterOp.apply(image_features, axis=-1)  # split features across tensor-parallel (mp) ranks
image_features = image_features.reshape([S, -1])
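# When tensor_parallel_size > 1, each rank now presumably holds a 1/tp_size slice of the
# grouped (spatial_conv_size**2) patch channels, so the resampler below consumes sharded features.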
image_features = self.model.resampler_model(
image_features,
image_mask,
token_type_ids_w_video,
image_type_ids,
grid_thw,
)
return image_features
@paddle.no_grad()
def prepare_rope3d(self, position_ids: paddle.Tensor, max_len: int) -> paddle.Tensor:
"""prepare_rope3d"""
prefix_max_position_ids = paddle.max(position_ids) + 1
dec_pos_ids = paddle.tile(
paddle.arange(max_len, dtype="int64").unsqueeze(0).unsqueeze(-1),
[1, 1, 3],
)
dec_pos_ids = dec_pos_ids + prefix_max_position_ids
position_ids_3d_real = paddle.concat([position_ids, dec_pos_ids], axis=1)
rope_emb = get_rope_3d(
position_ids=position_ids_3d_real,
rotary_dim=self.model_config.head_dim,
partial_rotary_factor=1.0,
base=self.model_config.rope_theta,
max_position=self.parallel_config.max_model_len,
freq_allocation=getattr(self.model_config, "freq_allocation", 20),
model_type=self.model_config.model_type,
)
return rope_emb
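# Hedged illustration of the 3D position ids built above (hypothetical prefix of 4 tokens,
# max_len=2, m = max(position_ids) + 1; the three axes are presumably time/height/width):
#   position_ids         : [[[t0,h0,w0], [t1,h1,w1], [t2,h2,w2], [t3,h3,w3]]]
#   dec_pos_ids          : [[[m, m, m], [m+1, m+1, m+1]]]
#   position_ids_3d_real : concat along axis=1 -> shape [1, 6, 3], fed to get_rope_3d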

View File

@@ -51,12 +51,13 @@ class XpuWorker(WorkerBase):
"""Initialize device and Construct model runner""" """Initialize device and Construct model runner"""
if paddle.is_compiled_with_xpu(): if paddle.is_compiled_with_xpu():
# Set environment variable # Set environment variable
self.device_ids = self.parallel_config.device_ids.split(",")
self.device = f"xpu:{self.local_rank}" self.device = f"xpu:{self.local_rank}"
paddle.device.set_device(self.device) paddle.device.set_device(self.device)
paddle.set_default_dtype(self.parallel_config.dtype) paddle.set_default_dtype(self.parallel_config.dtype)
self.device_ids = self.parallel_config.device_ids.split(",")
gc.collect() gc.collect()
paddle.device.xpu.empty_cache()
else: else:
raise RuntimeError(f"Not support device type: {self.device_config.device}") raise RuntimeError(f"Not support device type: {self.device_config.device}")
@@ -69,12 +70,11 @@ class XpuWorker(WorkerBase):
local_rank=self.local_rank, local_rank=self.local_rank,
) )
def graph_optimize_and_warm_up_model(self) -> None: def exist_prefill(self):
""" """
Perform the warm-up and the graph optimization check whether a prefill stage exists
""" """
if self.model_runner.graph_opt_level >= 1: return self.model_runner.exist_prefill()
self.model_runner.sot_warmup()
def determine_available_memory(self) -> int: def determine_available_memory(self) -> int:
""" """
@@ -133,20 +133,17 @@ class XpuWorker(WorkerBase):
paddle.device.xpu.empty_cache() paddle.device.xpu.empty_cache()
return available_kv_cache_memory # approximate value return available_kv_cache_memory # approximate value
def cal_theortical_kvcache(self) -> int:
""" """
return self.model_runner.cal_theortical_kvcache()
def load_model(self) -> None: def load_model(self) -> None:
""" """ """Load model"""
self.model_runner.load_model() self.model_runner.load_model()
def get_model(self) -> nn.Layer: def get_model(self) -> nn.Layer:
""" """ """Get current model"""
return self.model_runner.get_model() return self.model_runner.get_model()
def initialize_cache(self, num_gpu_blocks: int) -> None: def initialize_cache(self, num_gpu_blocks: int) -> None:
""" """ """Initizlize the KV Cache with accurate num_gpu_blocks"""
# accurate cache size
self.model_runner.update_share_input_block_num(num_gpu_blocks=num_gpu_blocks) self.model_runner.update_share_input_block_num(num_gpu_blocks=num_gpu_blocks)
def execute_model( def execute_model(
@@ -158,12 +155,6 @@ class XpuWorker(WorkerBase):
""" """ """ """
return self.model_runner.execute_model(model_forward_batch, num_running_requests, is_dummy_run) return self.model_runner.execute_model(model_forward_batch, num_running_requests, is_dummy_run)
def exist_prefill(self):
"""
check whether prefill stage exist
"""
return self.model_runner.exist_prefill()
def preprocess_new_task(self, req_dicts: List[Request], num_running_requests: int = -1) -> None: def preprocess_new_task(self, req_dicts: List[Request], num_running_requests: int = -1) -> None:
"""Process new requests and then start the decode loop """Process new requests and then start the decode loop
TODO(gongshaotian):The scheduler should schedule the handling of prefill, TODO(gongshaotian):The scheduler should schedule the handling of prefill,
@@ -172,8 +163,19 @@ class XpuWorker(WorkerBase):
if envs.ENABLE_V1_KVCACHE_SCHEDULER: if envs.ENABLE_V1_KVCACHE_SCHEDULER:
self.model_runner.insert_tasks_v1(req_dicts=req_dicts) self.model_runner.insert_tasks_v1(req_dicts=req_dicts)
else: else:
self.model_runner.process_prefill_inputs(req_dicts=req_dicts) self.model_runner.insert_prefill_inputs(req_dicts=req_dicts)
def graph_optimize_and_warm_up_model(self) -> None:
"""
Perform the warm-up and the graph optimization
"""
if self.model_runner.graph_opt_level >= 1:
self.model_runner.sot_warmup()
def check_health(self) -> bool: def check_health(self) -> bool:
""" """ """ """
return True return True
def cal_theortical_kvcache(self) -> int:
"""Calculate the block memory required"""
return self.model_runner.cal_theortical_kvcache()