[Iluvatar GPU] Adapt VL model (#4313)

2025-12-24 13:28:13 +08:00 · 2025-10-17 16:13:38 +08:00
parent ba5c2b7e37
commit 4b661512ca
15 changed files with 345 additions and 228 deletions
--- a/custom_ops/gpu_ops/cpp_extensions.cc
+++ b/custom_ops/gpu_ops/cpp_extensions.cc
@@ -1142,13 +1142,6 @@ PYBIND11_MODULE(fastdeploy_ops, m) {
   */
  m.def("recover_decode_task", &RecoverDecodeTask, "recover decode task for scheduler v1 function");

-  /**
-   * extract_text_token_output.cu
-   * extract_text_token_output
-   */
-  m.def("extract_text_token_output", &ExtractTextTokenOutput,
-        "extract_text_token_output function");
-
  m.def("group_swiglu_with_masked", &GroupSwigluWithMasked,
        "group_swiglu_with_masked function");

--- a/custom_ops/gpu_ops/extract_text_token_output.cu
+++ b/custom_ops/gpu_ops/extract_text_token_output.cu
@@ -1,101 +0,0 @@
-// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "helper.h"
-
-template <int THREADBLOCK_SIZE>
-__global__ void extract_text_token_output_kernel(int *max_seq_len,
-                                int *max_seq_len_index,
-                                int *mm_token_num_len,
-                                int *seq_lens_this_time,
-                                int *cu_seqlens_q,
-                                float *hidden_states,
-                                float *output,
-                                const int bsz,
-                                const int hidden_size) {
-    int bsz_index = threadIdx.x;
-    int block_idx = blockIdx.x;
-    if (bsz_index >= bsz) return;
-
-    int max_seq_len_data = max_seq_len[0];
-    int max_seq_len_index_data = max_seq_len_index[0];
-    int mm_token_num_len_data = mm_token_num_len[0];
-    int true_bsz = cu_seqlens_q[bsz_index + 1] - 1;
-    if (max_seq_len_data == mm_token_num_len_data && bsz_index == max_seq_len_index_data) {
-        output[bsz_index * hidden_size + block_idx] = 0.0;
-    } else {
-        if (seq_lens_this_time[bsz_index] != 0) {
-            output[bsz_index * hidden_size + block_idx] = hidden_states[true_bsz * hidden_size + block_idx];
-        }
-    }
-    __syncthreads();
-}
-
-std::vector<paddle::Tensor> ExtractTextTokenOutput(
-            const paddle::Tensor& max_seq_len,
-            const paddle::Tensor& max_seq_len_index,
-            const paddle::Tensor& mm_token_num_len,
-            const paddle::Tensor& seq_lens_this_time,
-            const paddle::Tensor& cu_seqlens_q,
-            const paddle::Tensor& hidden_states) {
-
-    const int bsz = seq_lens_this_time.shape()[0];
-    const int hidden_size = hidden_states.shape()[1];
-    paddle::Tensor output = paddle::full({bsz, hidden_size}, 1, paddle::DataType::FLOAT32, hidden_states.place());
-
-    extract_text_token_output_kernel<1024><<<hidden_size, 1024, 0, hidden_states.stream()>>>(
-      const_cast<int*>(max_seq_len.data<int>()),
-      const_cast<int*>(max_seq_len_index.data<int>()),
-      const_cast<int*>(mm_token_num_len.data<int>()),
-      const_cast<int*>(seq_lens_this_time.data<int>()),
-      const_cast<int*>(cu_seqlens_q.data<int>()),
-      const_cast<float*>(hidden_states.data<float>()),
-      output.data<float>(),
-      bsz,
-      hidden_size
-    );
-    return {output};
-}
-
-std::vector<std::vector<int64_t>> ExtractTextTokenOutputInferShape(const std::vector<int64_t>& max_seq_len_shape,
-                                                             const std::vector<int64_t>& max_seq_len_index_shape,
-                                                             const std::vector<int64_t>& mm_token_num_len_shape,
-                                                             const std::vector<int64_t>& seq_lens_this_time_shape,
-                                                             const std::vector<int64_t>& cu_seqlens_q_shape,
-                                                             const std::vector<int64_t>& hidden_states_shape) {
-    const int bsz = seq_lens_this_time_shape[0];
-    const int hidden_size = hidden_states_shape[1];
-    return {{bsz, hidden_size}};
-}
-
-std::vector<paddle::DataType> ExtractTextTokenOutputInferDtype(const paddle::DataType& max_seq_len_dtype,
-                                                         const paddle::DataType& max_seq_len_index_dtype,
-                                                         const paddle::DataType& mm_token_num_len_dtype,
-                                                         const paddle::DataType& seq_lens_this_time_dtype,
-                                                         const paddle::DataType& cu_seqlens_q_dtype,
-                                                         const paddle::DataType& hidden_states_dtype) {
-    return {hidden_states_dtype};
-}
-
-PD_BUILD_STATIC_OP(extract_text_token_output)
-    .Inputs({"max_seq_len",
-             "max_seq_len_index",
-             "mm_token_num_len",
-             "seq_lens_this_time",
-             "cu_seqlens_q",
-             "hidden_states"})
-    .Outputs({"output"})
-    .SetKernelFn(PD_KERNEL(ExtractTextTokenOutput))
-    .SetInferShapeFn(PD_INFER_SHAPE(ExtractTextTokenOutputInferShape))
-    .SetInferDtypeFn(PD_INFER_DTYPE(ExtractTextTokenOutputInferDtype));
--- a/custom_ops/setup_ops.py
+++ b/custom_ops/setup_ops.py
@@ -290,7 +290,6 @@ elif paddle.is_compiled_with_cuda():
        "gpu_ops/cpp_extensions.cc",
        "gpu_ops/share_external_data.cu",
        "gpu_ops/per_token_quant_fp8.cu",
-        "gpu_ops/extract_text_token_output.cu",
        "gpu_ops/update_split_fuse_input.cu",
        "gpu_ops/text_image_index_out.cu",
        "gpu_ops/text_image_gather_scatter.cu",
@@ -538,6 +537,9 @@ elif paddle.is_compiled_with_custom_device("iluvatar_gpu"):
                "gpu_ops/token_penalty_multi_scores.cu",
                "gpu_ops/sample_kernels/rejection_top_p_sampling.cu",
                "gpu_ops/sample_kernels/top_k_renorm_probs.cu",
+                "gpu_ops/text_image_index_out.cu",
+                "gpu_ops/text_image_gather_scatter.cu",
+                "gpu_ops/set_data_ipc.cu",
                "iluvatar_ops/moe_dispatch.cu",
                "iluvatar_ops/moe_reduce.cu",
                "iluvatar_ops/paged_attn.cu",
@@ -596,7 +598,6 @@ elif paddle.device.is_compiled_with_custom_device("metax_gpu"):
        "gpu_ops/read_data_ipc.cu",
        "gpu_ops/dequant_int8.cu",
        "gpu_ops/share_external_data.cu",
-        "gpu_ops/extract_text_token_output.cu",
        "gpu_ops/moe/tritonmoe_preprocess.cu",
        "gpu_ops/moe/moe_topk_select.cu",
        "gpu_ops/recover_decode_task.cu",