From 45f81b34f0f948c4b5d0acb907b693a50d7d3b52 Mon Sep 17 00:00:00 2001 From: Ryan Date: Fri, 29 Aug 2025 14:56:35 +0800 Subject: [PATCH] add dtype int32 (#3692) --- .../xpu_ops/test/test_speculate_get_output_padding_offset.py | 2 +- fastdeploy/engine/common_engine.py | 2 +- fastdeploy/model_executor/layers/utils.py | 4 ++-- fastdeploy/model_executor/pre_and_post_process.py | 4 ++-- fastdeploy/worker/xpu_model_runner.py | 2 +- tests/layers/test_append_attention.py | 2 +- tests/layers/test_append_attention_with_output.py | 2 +- 7 files changed, 9 insertions(+), 9 deletions(-) diff --git a/custom_ops/xpu_ops/test/test_speculate_get_output_padding_offset.py b/custom_ops/xpu_ops/test/test_speculate_get_output_padding_offset.py index f61de62ad..9822b4d3b 100644 --- a/custom_ops/xpu_ops/test/test_speculate_get_output_padding_offset.py +++ b/custom_ops/xpu_ops/test/test_speculate_get_output_padding_offset.py @@ -30,7 +30,7 @@ def test_speculate_get_output_padding_offset(): seq_lens_output = paddle.to_tensor(seq_lens_output, dtype="int32") out_token_num = paddle.sum(seq_lens_output) - output_cum_offsets_tmp = paddle.cumsum(max_seq_len - seq_lens_output) + output_cum_offsets_tmp = paddle.cumsum(max_seq_len - seq_lens_output, dtype="int32") output_padding_offset_xpu, output_cum_offsets_xpu = speculate_get_output_padding_offset( output_cum_offsets_tmp, out_token_num, seq_lens_output, max_seq_len diff --git a/fastdeploy/engine/common_engine.py b/fastdeploy/engine/common_engine.py index 6081d1e06..13a71eacf 100644 --- a/fastdeploy/engine/common_engine.py +++ b/fastdeploy/engine/common_engine.py @@ -397,7 +397,7 @@ class EngineSevice: image_type_ids = paddle.to_tensor(inputs["image_type_ids"], dtype="int32") image_mask = input_ids == self.data_processor.image_patch_id image_token_sum = paddle.full(shape=[len(input_ids) + 1], fill_value=0, dtype="int32") - image_token_sum[1:] = paddle.cumsum(image_mask.cast("int32")) + image_token_sum[1:] = paddle.cumsum(image_mask.cast("int32"), dtype="int32") grid_thw = [] for one in inputs["grid_thw"]: if one[0] == 1: diff --git a/fastdeploy/model_executor/layers/utils.py b/fastdeploy/model_executor/layers/utils.py index e7a6c0137..18e915715 100644 --- a/fastdeploy/model_executor/layers/utils.py +++ b/fastdeploy/model_executor/layers/utils.py @@ -257,7 +257,7 @@ def remove_padding( - The key sequence lengths (paddle.Tensor). """ if current_platform.is_cuda(): - cum_offsets_now = paddle.cumsum(max_len - seq_lens_this_time) + cum_offsets_now = paddle.cumsum(max_len - seq_lens_this_time, dtype="int32") token_num = paddle.sum(seq_lens_this_time) ( ids_remove_padding, @@ -301,7 +301,7 @@ def speculate_remove_padding( - Key sequence lengths (paddle.Tensor). """ if current_platform.is_cuda(): - cum_offsets_now = paddle.cumsum(max_len - seq_lens_this_time) + cum_offsets_now = paddle.cumsum(max_len - seq_lens_this_time, dtype="int32") token_num = paddle.sum(seq_lens_this_time) ( ids_remove_padding, diff --git a/fastdeploy/model_executor/pre_and_post_process.py b/fastdeploy/model_executor/pre_and_post_process.py index 92c43ede4..975174737 100644 --- a/fastdeploy/model_executor/pre_and_post_process.py +++ b/fastdeploy/model_executor/pre_and_post_process.py @@ -106,7 +106,7 @@ def pre_process( """ # Remove padding max_len = input_ids.shape[1] - cum_offsets_now = paddle.cumsum(max_len - seq_lens_this_time) + cum_offsets_now = paddle.cumsum(max_len - seq_lens_this_time, dtype="int32") token_num = paddle.sum(seq_lens_this_time) output_padding_offset = None output_cum_offsets = None @@ -132,7 +132,7 @@ def pre_process( if isinstance(seq_lens_output, list): seq_lens_output = seq_lens_output[0] output_token_num = paddle.sum(seq_lens_output) - output_cum_offsets_tmp = paddle.cumsum(max_len - seq_lens_output) + output_cum_offsets_tmp = paddle.cumsum(max_len - seq_lens_output, dtype="int32") output_padding_offset, output_cum_offsets = speculate_get_output_padding_offset( output_cum_offsets_tmp, output_token_num, diff --git a/fastdeploy/worker/xpu_model_runner.py b/fastdeploy/worker/xpu_model_runner.py index e933c00f2..cee71415b 100644 --- a/fastdeploy/worker/xpu_model_runner.py +++ b/fastdeploy/worker/xpu_model_runner.py @@ -63,7 +63,7 @@ def xpu_pre_process( ) -> XPUForwardMeta: """ """ max_len = input_ids.shape[1] - cum_offsets_now = paddle.cumsum(max_len - seq_lens_this_time) + cum_offsets_now = paddle.cumsum(max_len - seq_lens_this_time, dtype="int32") token_num = paddle.sum(seq_lens_this_time) ( diff --git a/tests/layers/test_append_attention.py b/tests/layers/test_append_attention.py index 5d45c5810..b9fbbf4d6 100644 --- a/tests/layers/test_append_attention.py +++ b/tests/layers/test_append_attention.py @@ -197,7 +197,7 @@ def naive_attention_impl( def get_padding_offset(bsz, max_seq_len, seq_lens_this_time): - cum_offsets_now = paddle.cumsum(max_seq_len - seq_lens_this_time) + cum_offsets_now = paddle.cumsum(max_seq_len - seq_lens_this_time, dtype="int32") cum_offsets = paddle.zeros(shape=(bsz + 1), dtype="int32") cum_offsets[1:] = cum_offsets_now token_num = paddle.sum(seq_lens_this_time) diff --git a/tests/layers/test_append_attention_with_output.py b/tests/layers/test_append_attention_with_output.py index 44d51c273..3c6f427cd 100644 --- a/tests/layers/test_append_attention_with_output.py +++ b/tests/layers/test_append_attention_with_output.py @@ -197,7 +197,7 @@ def naive_attention_impl( def get_padding_offset(bsz, max_seq_len, seq_lens_this_time): - cum_offsets_now = paddle.cumsum(max_seq_len - seq_lens_this_time) + cum_offsets_now = paddle.cumsum(max_seq_len - seq_lens_this_time, dtype="int32") cum_offsets = paddle.zeros(shape=(bsz + 1), dtype="int32") cum_offsets[1:] = cum_offsets_now token_num = paddle.sum(seq_lens_this_time)