mirror of
https://github.com/PaddlePaddle/FastDeploy.git
synced 2025-10-05 08:37:06 +08:00
[Code Simplification] remove cum_offsets (#3410)
Some checks failed
Deploy GitHub Pages / deploy (push) Has been cancelled
Publish Job / publish_pre_check (push) Has been cancelled
Publish Job / print_publish_pre_check_outputs (push) Has been cancelled
Publish Job / FD-Clone-Linux (push) Has been cancelled
Publish Job / Show Code Archive Output (push) Has been cancelled
Publish Job / BUILD_SM8090 (push) Has been cancelled
Publish Job / BUILD_SM8689 (push) Has been cancelled
Publish Job / PADDLE_PYPI_UPLOAD_8090 (push) Has been cancelled
Publish Job / PADDLE_PYPI_UPLOAD_8689 (push) Has been cancelled
Publish Job / Run FastDeploy Unit Tests and Coverage (push) Has been cancelled
Publish Job / Run FastDeploy LogProb Tests (push) Has been cancelled
Publish Job / Extracted partial CE model tasks to run in CI. (push) Has been cancelled
Publish Job / Run Base Tests (push) Has been cancelled
Publish Job / Run Accuracy Tests (push) Has been cancelled
Some checks failed
Deploy GitHub Pages / deploy (push) Has been cancelled
Publish Job / publish_pre_check (push) Has been cancelled
Publish Job / print_publish_pre_check_outputs (push) Has been cancelled
Publish Job / FD-Clone-Linux (push) Has been cancelled
Publish Job / Show Code Archive Output (push) Has been cancelled
Publish Job / BUILD_SM8090 (push) Has been cancelled
Publish Job / BUILD_SM8689 (push) Has been cancelled
Publish Job / PADDLE_PYPI_UPLOAD_8090 (push) Has been cancelled
Publish Job / PADDLE_PYPI_UPLOAD_8689 (push) Has been cancelled
Publish Job / Run FastDeploy Unit Tests and Coverage (push) Has been cancelled
Publish Job / Run FastDeploy LogProb Tests (push) Has been cancelled
Publish Job / Extracted partial CE model tasks to run in CI. (push) Has been cancelled
Publish Job / Run Base Tests (push) Has been cancelled
Publish Job / Run Accuracy Tests (push) Has been cancelled
This commit is contained in:
@@ -101,7 +101,6 @@ std::vector<paddle::Tensor> GetPaddingOffset(const paddle::Tensor &input_ids,
|
||||
cum_offsets_out.data<int>(),
|
||||
seq_length);
|
||||
return {x_remove_padding,
|
||||
cum_offsets_out,
|
||||
batch_id_per_token,
|
||||
cu_seqlens_q,
|
||||
cu_seqlens_k}; // , enc_token_num, dec_token_num};
|
||||
@@ -114,7 +113,7 @@ std::vector<std::vector<int64_t>> GetPaddingOffsetInferShape(
|
||||
const std::vector<int64_t> &seq_len_shape) {
|
||||
int64_t bsz = seq_len_shape[0];
|
||||
int64_t seq_len = input_ids_shape[1];
|
||||
return {{-1}, {bsz}, {-1}, {bsz + 1}, {bsz + 1}};
|
||||
return {{-1}, {-1}, {bsz + 1}, {bsz + 1}};
|
||||
}
|
||||
|
||||
std::vector<paddle::DataType> GetPaddingOffsetInferDtype(
|
||||
@@ -123,7 +122,6 @@ std::vector<paddle::DataType> GetPaddingOffsetInferDtype(
|
||||
const paddle::DataType &token_num_dtype,
|
||||
const paddle::DataType &seq_len_dtype) {
|
||||
return {input_ids_dtype,
|
||||
seq_len_dtype,
|
||||
seq_len_dtype,
|
||||
seq_len_dtype,
|
||||
seq_len_dtype};
|
||||
@@ -132,7 +130,6 @@ std::vector<paddle::DataType> GetPaddingOffsetInferDtype(
|
||||
PD_BUILD_STATIC_OP(get_padding_offset)
|
||||
.Inputs({"input_ids", "token_num", "cum_offsets", "seq_len"})
|
||||
.Outputs({"x_remove_padding",
|
||||
"cum_offsets_out",
|
||||
"batch_id_per_token",
|
||||
"cu_seqlens_q",
|
||||
"cu_seqlens_k"})
|
||||
|
@@ -17,7 +17,7 @@
|
||||
template <typename T, int VecSize>
|
||||
__global__ void RebuildPaddingKernel(T *output_data,
|
||||
const T *input_data,
|
||||
const int *cum_offsets,
|
||||
const int *cu_seqlens_q,
|
||||
const int *seq_len_this_time,
|
||||
const int *seq_len_decoder,
|
||||
const int *seq_len_encoder,
|
||||
@@ -34,10 +34,10 @@ __global__ void RebuildPaddingKernel(T *output_data,
|
||||
int seq_id = 0;
|
||||
if (seq_len_this_time[bi] == 0) continue;
|
||||
if (seq_len_decoder[bi] == 0 && seq_len_encoder[bi] == 0) continue;
|
||||
// if encoder, get last token; just decoder, get first token.
|
||||
if (seq_len_encoder[bi] > 0) seq_id = seq_len_encoder[bi] - 1;
|
||||
|
||||
const int ori_token_idx =
|
||||
bi * max_input_length - cum_offsets[bi] + seq_id;
|
||||
cu_seqlens_q[bi] + seq_id;
|
||||
const int src_offset = ori_token_idx * dim_embed + bias_idx;
|
||||
Load<T, VecSize>(&input_data[src_offset], &src_vec);
|
||||
Store<T, VecSize>(src_vec, &output_data[i]);
|
||||
@@ -47,29 +47,31 @@ __global__ void RebuildPaddingKernel(T *output_data,
|
||||
template <typename T, int VecSize>
|
||||
__global__ void RebuildAppendPaddingKernel(T *output_data,
|
||||
const T *input_data,
|
||||
const int *cum_offset,
|
||||
const int *cu_seqlens_q,
|
||||
const int *seq_len_this_time,
|
||||
const int *seq_len_decoder,
|
||||
const int *seq_len_encoder,
|
||||
const int *output_padding_offset,
|
||||
const int max_input_length,
|
||||
const int dim_embed,
|
||||
const int64_t output_elem_nums) {
|
||||
const int64_t output_elem_nums,
|
||||
const int bsz) {
|
||||
AlignedVector<T, VecSize> src_vec;
|
||||
const int64_t global_idx = blockDim.x * blockIdx.x + threadIdx.x;
|
||||
for (int64_t i = global_idx * VecSize; i < output_elem_nums;
|
||||
i += gridDim.x * blockDim.x * VecSize) {
|
||||
const int out_token_id = i / dim_embed;
|
||||
const int ori_token_id =
|
||||
out_token_id + output_padding_offset[out_token_id];
|
||||
const int ori_token_id = out_token_id + output_padding_offset[out_token_id];
|
||||
|
||||
const int bi = ori_token_id / max_input_length;
|
||||
|
||||
int seq_id = 0;
|
||||
if (seq_len_this_time[bi] == 0) continue;
|
||||
if (seq_len_decoder[bi] == 0 && seq_len_encoder[bi] == 0) continue;
|
||||
// if encoder, get last token; just decoder, get first token.
|
||||
if (seq_len_encoder[bi] > 0) seq_id = seq_len_encoder[bi] - 1;
|
||||
|
||||
const int input_token_id = ori_token_id - cum_offset[bi] + seq_id;
|
||||
if (seq_len_encoder[bi] > 0) seq_id = seq_len_encoder[bi] - 1;
|
||||
const int cum_offset_bi = bi * max_input_length - cu_seqlens_q[bi];
|
||||
const int input_token_id = ori_token_id - cum_offset_bi + seq_id;
|
||||
const int bias_idx = i % dim_embed;
|
||||
|
||||
Load<T, VecSize>(&input_data[input_token_id * dim_embed + bias_idx],
|
||||
@@ -78,10 +80,11 @@ __global__ void RebuildAppendPaddingKernel(T *output_data,
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
template <paddle::DataType D>
|
||||
std::vector<paddle::Tensor> rebuild_padding(
|
||||
const paddle::Tensor &tmp_out, // [token_num, dim_embed]
|
||||
const paddle::Tensor &cum_offsets, // [bsz, 1]
|
||||
const paddle::Tensor &cu_seqlens_q, // [bsz+1, 1]
|
||||
const paddle::Tensor &seq_len_this_time,
|
||||
const paddle::Tensor &seq_lens_decoder,
|
||||
const paddle::Tensor &seq_lens_encoder,
|
||||
@@ -100,7 +103,7 @@ std::vector<paddle::Tensor> rebuild_padding(
|
||||
std::vector<int64_t> tmp_out_shape = tmp_out.shape();
|
||||
const int token_num = tmp_out_shape[0];
|
||||
const int dim_embed = tmp_out_shape[1];
|
||||
const int bsz = cum_offsets.shape()[0];
|
||||
const int bsz = cu_seqlens_q.shape()[0] - 1;
|
||||
|
||||
paddle::Tensor out;
|
||||
if (output_padding_offset) {
|
||||
@@ -133,21 +136,22 @@ std::vector<paddle::Tensor> rebuild_padding(
|
||||
<<<grid_size, blocksize, 0, cu_stream>>>(
|
||||
reinterpret_cast<DataType_ *>(out.data<data_t>()),
|
||||
reinterpret_cast<const DataType_ *>(tmp_out.data<data_t>()),
|
||||
cum_offsets.data<int>(),
|
||||
cu_seqlens_q.data<int>(),
|
||||
seq_len_this_time.data<int>(),
|
||||
seq_lens_decoder.data<int>(),
|
||||
seq_lens_encoder.data<int>(),
|
||||
output_padding_offset.get_ptr()->data<int>(),
|
||||
max_input_length,
|
||||
dim_embed,
|
||||
elem_nums);
|
||||
elem_nums,
|
||||
bsz);
|
||||
} else {
|
||||
RebuildPaddingKernel<DataType_, PackSize>
|
||||
<<<grid_size, blocksize, 0, cu_stream>>>(
|
||||
reinterpret_cast<DataType_ *>(out.data<data_t>()),
|
||||
reinterpret_cast<DataType_ *>(
|
||||
const_cast<data_t *>(tmp_out.data<data_t>())),
|
||||
cum_offsets.data<int>(),
|
||||
cu_seqlens_q.data<int>(),
|
||||
seq_len_this_time.data<int>(),
|
||||
seq_lens_decoder.data<int>(),
|
||||
seq_lens_encoder.data<int>(),
|
||||
@@ -160,7 +164,7 @@ std::vector<paddle::Tensor> rebuild_padding(
|
||||
|
||||
paddle::Tensor RebuildPaddingFunc(
|
||||
const paddle::Tensor &tmp_out, // [token_num, dim_embed]
|
||||
const paddle::Tensor &cum_offsets, // [bsz, 1]
|
||||
const paddle::Tensor &cu_seqlens_q, // [bsz+1, 1]
|
||||
const paddle::Tensor &seq_len_this_time,
|
||||
const paddle::Tensor &seq_lens_decoder,
|
||||
const paddle::Tensor &seq_lens_encoder,
|
||||
@@ -170,7 +174,7 @@ paddle::Tensor RebuildPaddingFunc(
|
||||
case paddle::DataType::BFLOAT16: {
|
||||
return rebuild_padding<paddle::DataType::BFLOAT16>(
|
||||
tmp_out,
|
||||
cum_offsets,
|
||||
cu_seqlens_q,
|
||||
seq_len_this_time,
|
||||
seq_lens_decoder,
|
||||
seq_lens_encoder,
|
||||
@@ -180,7 +184,7 @@ paddle::Tensor RebuildPaddingFunc(
|
||||
case paddle::DataType::FLOAT16: {
|
||||
return rebuild_padding<paddle::DataType::FLOAT16>(
|
||||
tmp_out,
|
||||
cum_offsets,
|
||||
cu_seqlens_q,
|
||||
seq_len_this_time,
|
||||
seq_lens_decoder,
|
||||
seq_lens_encoder,
|
||||
@@ -190,7 +194,7 @@ paddle::Tensor RebuildPaddingFunc(
|
||||
case paddle::DataType::FLOAT32: {
|
||||
return rebuild_padding<paddle::DataType::FLOAT32>(
|
||||
tmp_out,
|
||||
cum_offsets,
|
||||
cu_seqlens_q,
|
||||
seq_len_this_time,
|
||||
seq_lens_decoder,
|
||||
seq_lens_encoder,
|
||||
@@ -208,14 +212,14 @@ paddle::Tensor RebuildPaddingFunc(
|
||||
|
||||
std::vector<paddle::Tensor> RebuildPadding(
|
||||
const paddle::Tensor &tmp_out, // [token_num, dim_embed]
|
||||
const paddle::Tensor &cum_offsets, // [bsz, 1]
|
||||
const paddle::Tensor &cu_seqlens_q, // [bsz+1, 1]
|
||||
const paddle::Tensor &seq_len_this_time,
|
||||
const paddle::Tensor &seq_lens_decoder,
|
||||
const paddle::Tensor &seq_lens_encoder,
|
||||
const paddle::optional<paddle::Tensor> &output_padding_offset,
|
||||
int max_input_length) {
|
||||
return {RebuildPaddingFunc(tmp_out,
|
||||
cum_offsets,
|
||||
cu_seqlens_q,
|
||||
seq_len_this_time,
|
||||
seq_lens_decoder,
|
||||
seq_lens_encoder,
|
||||
@@ -225,7 +229,7 @@ std::vector<paddle::Tensor> RebuildPadding(
|
||||
|
||||
std::vector<std::vector<int64_t>> RebuildPaddingInferShape(
|
||||
const std::vector<int64_t> &tmp_out_shape,
|
||||
const std::vector<int64_t> &cum_offsets_shape,
|
||||
const std::vector<int64_t> &cu_seqlens_q_shape,
|
||||
const std::vector<int64_t> &seq_len_this_time_shape,
|
||||
const std::vector<int64_t> &seq_lens_decoder_shape,
|
||||
const std::vector<int64_t> &seq_lens_encoder_shape,
|
||||
@@ -235,14 +239,14 @@ std::vector<std::vector<int64_t>> RebuildPaddingInferShape(
|
||||
if (output_padding_offset_shape) {
|
||||
return {{-1, dim_embed}};
|
||||
} else {
|
||||
int64_t bsz = cum_offsets_shape[0];
|
||||
int64_t bsz = cu_seqlens_q_shape[0] - 1;
|
||||
return {{bsz, dim_embed}};
|
||||
}
|
||||
}
|
||||
|
||||
std::vector<paddle::DataType> RebuildPaddingInferDtype(
|
||||
const paddle::DataType &tmp_out_dtype,
|
||||
const paddle::DataType &cum_offsets_dtype,
|
||||
const paddle::DataType &cu_seqlens_q_dtype,
|
||||
const paddle::DataType &seq_len_this_time_dtype,
|
||||
const paddle::DataType &seq_lens_decoder_dtype,
|
||||
const paddle::DataType &seq_lens_encoder_dtype,
|
||||
@@ -252,7 +256,7 @@ std::vector<paddle::DataType> RebuildPaddingInferDtype(
|
||||
|
||||
PD_BUILD_STATIC_OP(rebuild_padding)
|
||||
.Inputs({"tmp_out",
|
||||
"cum_offsets",
|
||||
"cu_seqlens_q",
|
||||
"seq_len_this_time",
|
||||
"seq_lens_decoder",
|
||||
"seq_lens_encoder",
|
||||
|
@@ -106,7 +106,6 @@ std::vector<paddle::Tensor> SpeculateGetPaddingOffset(
|
||||
seq_length,
|
||||
max_draft_tokens);
|
||||
return {x_remove_padding,
|
||||
cum_offsets_out,
|
||||
batch_id_per_token,
|
||||
cu_seqlens_q,
|
||||
cu_seqlens_k}; // , enc_token_num, dec_token_num};
|
||||
@@ -121,7 +120,7 @@ std::vector<std::vector<int64_t>> SpeculateGetPaddingOffsetInferShape(
|
||||
const std::vector<int64_t>& seq_lens_encoder_shape) {
|
||||
int64_t bsz = seq_len_shape[0];
|
||||
int64_t seq_len = input_ids_shape[1];
|
||||
return {{-1}, {bsz}, {-1}, {bsz + 1}, {bsz + 1}};
|
||||
return {{-1}, {-1}, {bsz + 1}, {bsz + 1}};
|
||||
}
|
||||
|
||||
std::vector<paddle::DataType> SpeculateGetPaddingOffsetInferDtype(
|
||||
@@ -132,7 +131,6 @@ std::vector<paddle::DataType> SpeculateGetPaddingOffsetInferDtype(
|
||||
const paddle::DataType& seq_len_dtype,
|
||||
const paddle::DataType& seq_lens_encoder_dtype) {
|
||||
return {input_ids_dtype,
|
||||
seq_len_dtype,
|
||||
seq_len_dtype,
|
||||
seq_len_dtype,
|
||||
seq_len_dtype};
|
||||
@@ -141,12 +139,10 @@ std::vector<paddle::DataType> SpeculateGetPaddingOffsetInferDtype(
|
||||
PD_BUILD_STATIC_OP(speculate_get_padding_offset)
|
||||
.Inputs({"input_ids",
|
||||
"draft_tokens",
|
||||
"cum_offsets",
|
||||
"token_num",
|
||||
"seq_len",
|
||||
"seq_lens_encoder"})
|
||||
.Outputs({"x_remove_padding",
|
||||
"cum_offsets_out",
|
||||
"batch_id_per_token",
|
||||
"cu_seqlens_q",
|
||||
"cu_seqlens_k"})
|
||||
|
Reference in New Issue
Block a user