Adapt for iluvatar gpu (#2684)

2025-12-24 13:28:13 +08:00 · 2025-07-07 16:53:14 +08:00
parent 2579e8fea8
commit 1b54a2831e
50 changed files with 4485 additions and 80 deletions
--- a/custom_ops/gpu_ops/get_padding_offset.cu
+++ b/custom_ops/gpu_ops/get_padding_offset.cu
@@ -13,6 +13,7 @@
 // limitations under the License.

 #include "paddle/extension.h"
+#include "helper.h"

 #ifndef PD_BUILD_STATIC_OP
 #define PD_BUILD_STATIC_OP(name) PD_BUILD_OP(static_op_##name)
@@ -59,7 +60,12 @@ std::vector<paddle::Tensor> GetPaddingOffset(const paddle::Tensor &input_ids,
                                             const paddle::Tensor &cum_offsets,
                                             const paddle::Tensor &token_num,
                                             const paddle::Tensor &seq_len) {
+#ifdef PADDLE_WITH_CUSTOM_DEVICE
+    auto dev_ctx = static_cast<const phi::CustomContext*>(paddle::experimental::DeviceContextPool::Instance().Get(input_ids.place()));
+    auto cu_stream = dev_ctx->stream();
+#else
    auto cu_stream = input_ids.stream();
+#endif
    std::vector<int64_t> input_ids_shape = input_ids.shape();
    const int bsz = seq_len.shape()[0];
    const int seq_length = input_ids_shape[1];
@@ -75,7 +81,11 @@ std::vector<paddle::Tensor> GetPaddingOffset(const paddle::Tensor &input_ids,
        paddle::full({bsz + 1}, 0, paddle::DataType::INT32, input_ids.place());
    auto cu_seqlens_k =
        paddle::full({bsz + 1}, 0, paddle::DataType::INT32, input_ids.place());
-    int blockSize = min((token_num_data + 32 - 1) / 32 * 32, 128);
+#ifdef PADDLE_WITH_COREX
+    int blockSize = std::min((token_num_data + WARP_SIZE - 1) / WARP_SIZE * WARP_SIZE, 128);
+#else
+    int blockSize = min((token_num_data + WARP_SIZE - 1) / WARP_SIZE * WARP_SIZE, 128);
+#endif
    GetPaddingOffsetKernel<<<bsz, 128, 0, cu_stream>>>(
        padding_offset.data<int>(),
        cum_offsets_out.data<int>(),