mirror of
https://github.com/PaddlePaddle/FastDeploy.git
synced 2025-12-24 13:28:13 +08:00
c++ code format (#4527)
This commit is contained in:
@@ -19,28 +19,28 @@ std::vector<paddle::Tensor> InvokeAvxWeightOnly(const paddle::Tensor &x,
|
||||
const paddle::Tensor &w_bias,
|
||||
const std::string &alog,
|
||||
bool trans) {
|
||||
auto out_shape = x.shape();
|
||||
out_shape[out_shape.size() - 1] = weight.shape()[1];
|
||||
auto out = paddle::empty(out_shape, x.dtype(), paddle::CPUPlace());
|
||||
return {out};
|
||||
auto out_shape = x.shape();
|
||||
out_shape[out_shape.size() - 1] = weight.shape()[1];
|
||||
auto out = paddle::empty(out_shape, x.dtype(), paddle::CPUPlace());
|
||||
return {out};
|
||||
}
|
||||
|
||||
std::vector<std::vector<int64_t>> AvxWeightOnlyInferShape(
|
||||
std::vector<int64_t> x_shape,
|
||||
std::vector<int64_t> weigh_shape,
|
||||
std::vector<int64_t> weigh_bias_shape) {
|
||||
int m = 1;
|
||||
for (int i = 0; i < x_shape.size() - 1; i++) {
|
||||
m = m * x_shape[i];
|
||||
}
|
||||
return {std::vector<int64_t>{m, weigh_shape[1]}};
|
||||
int m = 1;
|
||||
for (int i = 0; i < x_shape.size() - 1; i++) {
|
||||
m = m * x_shape[i];
|
||||
}
|
||||
return {std::vector<int64_t>{m, weigh_shape[1]}};
|
||||
}
|
||||
|
||||
std::vector<paddle::DataType> AvxWeightOnlyInferDtype(
|
||||
paddle::DataType x_dtype,
|
||||
paddle::DataType weight_dtype,
|
||||
paddle::DataType weight_bias_dtype) {
|
||||
return {x_dtype};
|
||||
return {x_dtype};
|
||||
}
|
||||
|
||||
PD_BUILD_STATIC_OP(avx_weight_only)
|
||||
|
||||
@@ -20,13 +20,13 @@ void remove_padding(int64_t *output_data,
|
||||
const int *cum_offsets,
|
||||
const int sequence_length,
|
||||
const int bsz) {
|
||||
for (int bi = 0; bi < bsz; ++bi) {
|
||||
for (int i = 0; i < seq_lens[bi]; ++i) {
|
||||
const int tgt_seq_id = bi * sequence_length - cum_offsets[bi] + i;
|
||||
const int src_seq_id = bi * sequence_length + i;
|
||||
output_data[tgt_seq_id] = input_data[src_seq_id];
|
||||
}
|
||||
for (int bi = 0; bi < bsz; ++bi) {
|
||||
for (int i = 0; i < seq_lens[bi]; ++i) {
|
||||
const int tgt_seq_id = bi * sequence_length - cum_offsets[bi] + i;
|
||||
const int src_seq_id = bi * sequence_length + i;
|
||||
output_data[tgt_seq_id] = input_data[src_seq_id];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void get_padding_offset_kernel(int *padding_offset,
|
||||
@@ -37,56 +37,53 @@ void get_padding_offset_kernel(int *padding_offset,
|
||||
const int *seq_lens,
|
||||
const int max_seq_len,
|
||||
const int bsz) {
|
||||
for (int bi = 0; bi < bsz; ++bi) {
|
||||
int cum_offset = bi == 0 ? 0 : cum_offsets[bi - 1];
|
||||
auto seq_len_now = seq_lens[bi];
|
||||
for (int i = 0; i < seq_len_now; ++i) {
|
||||
padding_offset[bi * max_seq_len - cum_offset + i] = cum_offset;
|
||||
}
|
||||
cum_offsets_out[bi] = cum_offset;
|
||||
int cum_seq_len = (bi + 1) * max_seq_len - cum_offsets[bi];
|
||||
cu_seqlens_q[bi + 1] = cum_seq_len;
|
||||
cu_seqlens_k[bi + 1] = cum_seq_len;
|
||||
for (int bi = 0; bi < bsz; ++bi) {
|
||||
int cum_offset = bi == 0 ? 0 : cum_offsets[bi - 1];
|
||||
auto seq_len_now = seq_lens[bi];
|
||||
for (int i = 0; i < seq_len_now; ++i) {
|
||||
padding_offset[bi * max_seq_len - cum_offset + i] = cum_offset;
|
||||
}
|
||||
cum_offsets_out[bi] = cum_offset;
|
||||
int cum_seq_len = (bi + 1) * max_seq_len - cum_offsets[bi];
|
||||
cu_seqlens_q[bi + 1] = cum_seq_len;
|
||||
cu_seqlens_k[bi + 1] = cum_seq_len;
|
||||
}
|
||||
}
|
||||
|
||||
std::vector<paddle::Tensor> GetPaddingOffset(const paddle::Tensor &input_ids,
|
||||
const paddle::Tensor &cum_offsets,
|
||||
const paddle::Tensor &token_num,
|
||||
const paddle::Tensor &seq_len) {
|
||||
std::vector<int64_t> input_ids_shape = input_ids.shape();
|
||||
const int bsz = seq_len.shape()[0];
|
||||
const int seq_length = input_ids_shape[1];
|
||||
auto cum_offsets_out = cum_offsets.copy_to(paddle::CPUPlace(), false);
|
||||
auto cpu_token_num = token_num.copy_to(paddle::CPUPlace(), false);
|
||||
std::vector<int64_t> input_ids_shape = input_ids.shape();
|
||||
const int bsz = seq_len.shape()[0];
|
||||
const int seq_length = input_ids_shape[1];
|
||||
auto cum_offsets_out = cum_offsets.copy_to(paddle::CPUPlace(), false);
|
||||
auto cpu_token_num = token_num.copy_to(paddle::CPUPlace(), false);
|
||||
|
||||
const int token_num_data = cpu_token_num.data<int64_t>()[0];
|
||||
auto x_remove_padding = paddle::empty(
|
||||
{token_num_data}, paddle::DataType::INT64, input_ids.place());
|
||||
auto padding_offset = paddle::empty(
|
||||
{token_num_data}, paddle::DataType::INT32, input_ids.place());
|
||||
auto cu_seqlens_q =
|
||||
paddle::full({bsz + 1}, 0, paddle::DataType::INT32, input_ids.place());
|
||||
auto cu_seqlens_k =
|
||||
paddle::full({bsz + 1}, 0, paddle::DataType::INT32, input_ids.place());
|
||||
get_padding_offset_kernel(padding_offset.data<int>(),
|
||||
cum_offsets_out.data<int>(),
|
||||
cu_seqlens_q.data<int>(),
|
||||
cu_seqlens_k.data<int>(),
|
||||
cum_offsets.data<int>(),
|
||||
seq_len.data<int>(),
|
||||
seq_length,
|
||||
bsz);
|
||||
remove_padding(x_remove_padding.data<int64_t>(),
|
||||
input_ids.data<int64_t>(),
|
||||
seq_len.data<int>(),
|
||||
cum_offsets_out.data<int>(),
|
||||
seq_length,
|
||||
bsz);
|
||||
return {x_remove_padding,
|
||||
padding_offset,
|
||||
cu_seqlens_q,
|
||||
cu_seqlens_k};
|
||||
const int token_num_data = cpu_token_num.data<int64_t>()[0];
|
||||
auto x_remove_padding = paddle::empty(
|
||||
{token_num_data}, paddle::DataType::INT64, input_ids.place());
|
||||
auto padding_offset = paddle::empty(
|
||||
{token_num_data}, paddle::DataType::INT32, input_ids.place());
|
||||
auto cu_seqlens_q =
|
||||
paddle::full({bsz + 1}, 0, paddle::DataType::INT32, input_ids.place());
|
||||
auto cu_seqlens_k =
|
||||
paddle::full({bsz + 1}, 0, paddle::DataType::INT32, input_ids.place());
|
||||
get_padding_offset_kernel(padding_offset.data<int>(),
|
||||
cum_offsets_out.data<int>(),
|
||||
cu_seqlens_q.data<int>(),
|
||||
cu_seqlens_k.data<int>(),
|
||||
cum_offsets.data<int>(),
|
||||
seq_len.data<int>(),
|
||||
seq_length,
|
||||
bsz);
|
||||
remove_padding(x_remove_padding.data<int64_t>(),
|
||||
input_ids.data<int64_t>(),
|
||||
seq_len.data<int>(),
|
||||
cum_offsets_out.data<int>(),
|
||||
seq_length,
|
||||
bsz);
|
||||
return {x_remove_padding, padding_offset, cu_seqlens_q, cu_seqlens_k};
|
||||
}
|
||||
|
||||
std::vector<std::vector<int64_t>> GetPaddingOffsetInferShape(
|
||||
@@ -94,9 +91,9 @@ std::vector<std::vector<int64_t>> GetPaddingOffsetInferShape(
|
||||
const std::vector<int64_t> &cum_offsets_shape,
|
||||
const std::vector<int64_t> &token_num_shape,
|
||||
const std::vector<int64_t> &seq_len_shape) {
|
||||
int64_t bsz = seq_len_shape[0];
|
||||
int64_t seq_len = input_ids_shape[1];
|
||||
return {{-1}, {-1}, {bsz + 1}, {bsz + 1}};
|
||||
int64_t bsz = seq_len_shape[0];
|
||||
int64_t seq_len = input_ids_shape[1];
|
||||
return {{-1}, {-1}, {bsz + 1}, {bsz + 1}};
|
||||
}
|
||||
|
||||
std::vector<paddle::DataType> GetPaddingOffsetInferDtype(
|
||||
@@ -104,18 +101,13 @@ std::vector<paddle::DataType> GetPaddingOffsetInferDtype(
|
||||
const paddle::DataType &cum_offsets_dtype,
|
||||
const paddle::DataType &token_num_dtype,
|
||||
const paddle::DataType &seq_len_dtype) {
|
||||
return {input_ids_dtype,
|
||||
seq_len_dtype,
|
||||
seq_len_dtype,
|
||||
seq_len_dtype};
|
||||
return {input_ids_dtype, seq_len_dtype, seq_len_dtype, seq_len_dtype};
|
||||
}
|
||||
|
||||
PD_BUILD_STATIC_OP(get_padding_offset_cpu)
|
||||
.Inputs({"input_ids", "cum_offsets", "token_num", "seq_len"})
|
||||
.Outputs({"x_remove_padding",
|
||||
"padding_offset",
|
||||
"cu_seqlens_q",
|
||||
"cu_seqlens_k"})
|
||||
.Outputs(
|
||||
{"x_remove_padding", "padding_offset", "cu_seqlens_q", "cu_seqlens_k"})
|
||||
.SetKernelFn(PD_KERNEL(GetPaddingOffset))
|
||||
.SetInferShapeFn(PD_INFER_SHAPE(GetPaddingOffsetInferShape))
|
||||
.SetInferDtypeFn(PD_INFER_DTYPE(GetPaddingOffsetInferDtype));
|
||||
|
||||
@@ -19,7 +19,6 @@
|
||||
#define PD_BUILD_STATIC_OP(name) PD_BUILD_OP(static_op_##name)
|
||||
#endif
|
||||
|
||||
|
||||
template <typename T>
|
||||
void RebuildPaddingCPUImpl(T *output_data,
|
||||
const T *input_data,
|
||||
@@ -30,27 +29,27 @@ void RebuildPaddingCPUImpl(T *output_data,
|
||||
int max_input_length,
|
||||
int dim_embed,
|
||||
const int elem_nums) {
|
||||
for (int i = 0; i < elem_nums; ++i) {
|
||||
const int bi = i / dim_embed;
|
||||
const int bias_idx = i % dim_embed;
|
||||
int seq_id = 0;
|
||||
for (int i = 0; i < elem_nums; ++i) {
|
||||
const int bi = i / dim_embed;
|
||||
const int bias_idx = i % dim_embed;
|
||||
int seq_id = 0;
|
||||
|
||||
if (seq_len_this_time_data[bi] == 0) {
|
||||
continue;
|
||||
}
|
||||
if (seq_lens_decoder_data[bi] == 0 && seq_lens_encoder_data[bi] == 0) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if (seq_lens_encoder_data[bi] > 0) {
|
||||
seq_id = seq_lens_encoder_data[bi] - 1;
|
||||
}
|
||||
|
||||
const int ori_token_idx = cu_seqlens_q_data[bi] + seq_id;
|
||||
const int src_offset = ori_token_idx * dim_embed + bias_idx;
|
||||
|
||||
output_data[i] = input_data[src_offset];
|
||||
if (seq_len_this_time_data[bi] == 0) {
|
||||
continue;
|
||||
}
|
||||
if (seq_lens_decoder_data[bi] == 0 && seq_lens_encoder_data[bi] == 0) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if (seq_lens_encoder_data[bi] > 0) {
|
||||
seq_id = seq_lens_encoder_data[bi] - 1;
|
||||
}
|
||||
|
||||
const int ori_token_idx = cu_seqlens_q_data[bi] + seq_id;
|
||||
const int src_offset = ori_token_idx * dim_embed + bias_idx;
|
||||
|
||||
output_data[i] = input_data[src_offset];
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
@@ -64,27 +63,25 @@ void RebuildAppendPaddingCPUImpl(T *output_data,
|
||||
const int max_input_length,
|
||||
const int dim_embed,
|
||||
const int64_t output_elem_nums) {
|
||||
for (int i = 0; i < output_elem_nums; ++i) {
|
||||
int out_token_id = i / dim_embed;
|
||||
int ori_token_id =
|
||||
out_token_id + output_padding_offset_data[out_token_id];
|
||||
int bi = ori_token_id / max_input_length;
|
||||
if (seq_len_this_time_data[bi] == 0 ||
|
||||
(seq_lens_decoder_data[bi] == 0 &&
|
||||
seq_lens_encoder_data[bi] == 0)) {
|
||||
continue;
|
||||
}
|
||||
int seq_id = 0;
|
||||
|
||||
if (seq_lens_encoder_data[bi] > 0) {
|
||||
seq_id = seq_lens_encoder_data[bi] - 1;
|
||||
}
|
||||
int input_token_id = cu_seqlens_q_data[bi] + seq_id;
|
||||
int bias_idx = i % dim_embed;
|
||||
int src_offset = input_token_id * dim_embed + bias_idx;
|
||||
|
||||
output_data[i] = input_data[src_offset];
|
||||
for (int i = 0; i < output_elem_nums; ++i) {
|
||||
int out_token_id = i / dim_embed;
|
||||
int ori_token_id = out_token_id + output_padding_offset_data[out_token_id];
|
||||
int bi = ori_token_id / max_input_length;
|
||||
if (seq_len_this_time_data[bi] == 0 ||
|
||||
(seq_lens_decoder_data[bi] == 0 && seq_lens_encoder_data[bi] == 0)) {
|
||||
continue;
|
||||
}
|
||||
int seq_id = 0;
|
||||
|
||||
if (seq_lens_encoder_data[bi] > 0) {
|
||||
seq_id = seq_lens_encoder_data[bi] - 1;
|
||||
}
|
||||
int input_token_id = cu_seqlens_q_data[bi] + seq_id;
|
||||
int bias_idx = i % dim_embed;
|
||||
int src_offset = input_token_id * dim_embed + bias_idx;
|
||||
|
||||
output_data[i] = input_data[src_offset];
|
||||
}
|
||||
}
|
||||
|
||||
std::vector<paddle::Tensor> RebuildPaddingCPU(
|
||||
@@ -95,140 +92,139 @@ std::vector<paddle::Tensor> RebuildPaddingCPU(
|
||||
const paddle::Tensor &seq_lens_encoder,
|
||||
const paddle::optional<paddle::Tensor> &output_padding_offset,
|
||||
int max_input_length) {
|
||||
auto tmp_out_cpu = tmp_out.copy_to(paddle::CPUPlace(), true);
|
||||
auto cu_seqlens_q_cpu = cu_seqlens_q.copy_to(paddle::CPUPlace(), true);
|
||||
auto seq_len_this_time_cpu =
|
||||
seq_len_this_time.copy_to(paddle::CPUPlace(), true);
|
||||
auto seq_lens_decoder_cpu =
|
||||
seq_lens_decoder.copy_to(paddle::CPUPlace(), true);
|
||||
auto seq_lens_encoder_cpu =
|
||||
seq_lens_encoder.copy_to(paddle::CPUPlace(), true);
|
||||
paddle::optional<paddle::Tensor> output_padding_offset_cpu;
|
||||
if (output_padding_offset) {
|
||||
output_padding_offset_cpu =
|
||||
output_padding_offset->copy_to(paddle::CPUPlace(), true);
|
||||
auto tmp_out_cpu = tmp_out.copy_to(paddle::CPUPlace(), true);
|
||||
auto cu_seqlens_q_cpu = cu_seqlens_q.copy_to(paddle::CPUPlace(), true);
|
||||
auto seq_len_this_time_cpu =
|
||||
seq_len_this_time.copy_to(paddle::CPUPlace(), true);
|
||||
auto seq_lens_decoder_cpu =
|
||||
seq_lens_decoder.copy_to(paddle::CPUPlace(), true);
|
||||
auto seq_lens_encoder_cpu =
|
||||
seq_lens_encoder.copy_to(paddle::CPUPlace(), true);
|
||||
paddle::optional<paddle::Tensor> output_padding_offset_cpu;
|
||||
if (output_padding_offset) {
|
||||
output_padding_offset_cpu =
|
||||
output_padding_offset->copy_to(paddle::CPUPlace(), true);
|
||||
}
|
||||
|
||||
int token_num = tmp_out_cpu.shape()[0];
|
||||
int dim_embed = tmp_out_cpu.shape()[1];
|
||||
int bsz = cu_seqlens_q_cpu.shape()[0] - 1;
|
||||
|
||||
paddle::Tensor out;
|
||||
if (output_padding_offset_cpu) {
|
||||
int need_delete_token_num = 0;
|
||||
for (int i = 0; i < bsz; ++i) {
|
||||
if (seq_lens_encoder_cpu.data<int>()[i] > 0) {
|
||||
need_delete_token_num += seq_lens_encoder_cpu.data<int>()[i] - 1;
|
||||
}
|
||||
}
|
||||
int output_token_num = token_num - need_delete_token_num;
|
||||
out = paddle::full({output_token_num, dim_embed},
|
||||
0,
|
||||
tmp_out_cpu.dtype(),
|
||||
paddle::CPUPlace());
|
||||
} else {
|
||||
out = paddle::full(
|
||||
{bsz, dim_embed}, 0, tmp_out_cpu.dtype(), paddle::CPUPlace());
|
||||
}
|
||||
|
||||
int token_num = tmp_out_cpu.shape()[0];
|
||||
int dim_embed = tmp_out_cpu.shape()[1];
|
||||
int bsz = cu_seqlens_q_cpu.shape()[0] - 1;
|
||||
const int *cu_seqlens_q_data = cu_seqlens_q_cpu.data<int>();
|
||||
const int *seq_len_this_time_data = seq_len_this_time_cpu.data<int>();
|
||||
const int *seq_lens_decoder_data = seq_lens_decoder_cpu.data<int>();
|
||||
const int *seq_lens_encoder_data = seq_lens_encoder_cpu.data<int>();
|
||||
int elem_nums = out.numel();
|
||||
|
||||
paddle::Tensor out;
|
||||
if (output_padding_offset_cpu) {
|
||||
int need_delete_token_num = 0;
|
||||
for (int i = 0; i < bsz; ++i) {
|
||||
if (seq_lens_encoder_cpu.data<int>()[i] > 0) {
|
||||
need_delete_token_num +=
|
||||
seq_lens_encoder_cpu.data<int>()[i] - 1;
|
||||
}
|
||||
}
|
||||
int output_token_num = token_num - need_delete_token_num;
|
||||
out = paddle::full({output_token_num, dim_embed},
|
||||
0,
|
||||
tmp_out_cpu.dtype(),
|
||||
paddle::CPUPlace());
|
||||
} else {
|
||||
out = paddle::full(
|
||||
{bsz, dim_embed}, 0, tmp_out_cpu.dtype(), paddle::CPUPlace());
|
||||
if (output_padding_offset_cpu) {
|
||||
const int *output_padding_offset_data =
|
||||
output_padding_offset_cpu->data<int>();
|
||||
switch (tmp_out_cpu.dtype()) {
|
||||
case paddle::DataType::FLOAT32:
|
||||
RebuildAppendPaddingCPUImpl<float>(out.data<float>(),
|
||||
tmp_out_cpu.data<float>(),
|
||||
cu_seqlens_q_data,
|
||||
seq_len_this_time_data,
|
||||
seq_lens_decoder_data,
|
||||
seq_lens_encoder_data,
|
||||
output_padding_offset_data,
|
||||
max_input_length,
|
||||
dim_embed,
|
||||
elem_nums);
|
||||
break;
|
||||
case paddle::DataType::FLOAT16:
|
||||
RebuildAppendPaddingCPUImpl<paddle::float16>(
|
||||
out.data<paddle::float16>(),
|
||||
tmp_out_cpu.data<paddle::float16>(),
|
||||
cu_seqlens_q_data,
|
||||
seq_len_this_time_data,
|
||||
seq_lens_decoder_data,
|
||||
seq_lens_encoder_data,
|
||||
output_padding_offset_data,
|
||||
max_input_length,
|
||||
dim_embed,
|
||||
elem_nums);
|
||||
break;
|
||||
case paddle::DataType::BFLOAT16:
|
||||
RebuildAppendPaddingCPUImpl<paddle::bfloat16>(
|
||||
out.data<paddle::bfloat16>(),
|
||||
tmp_out_cpu.data<paddle::bfloat16>(),
|
||||
cu_seqlens_q_data,
|
||||
seq_len_this_time_data,
|
||||
seq_lens_decoder_data,
|
||||
seq_lens_encoder_data,
|
||||
output_padding_offset_data,
|
||||
max_input_length,
|
||||
dim_embed,
|
||||
elem_nums);
|
||||
break;
|
||||
default:
|
||||
PD_THROW(
|
||||
"Unsupported data type for rebuild_padding_cpu. "
|
||||
"Only float32, float16, and bfloat16 are supported.");
|
||||
}
|
||||
|
||||
const int *cu_seqlens_q_data = cu_seqlens_q_cpu.data<int>();
|
||||
const int *seq_len_this_time_data = seq_len_this_time_cpu.data<int>();
|
||||
const int *seq_lens_decoder_data = seq_lens_decoder_cpu.data<int>();
|
||||
const int *seq_lens_encoder_data = seq_lens_encoder_cpu.data<int>();
|
||||
int elem_nums = out.numel();
|
||||
|
||||
if (output_padding_offset_cpu) {
|
||||
const int *output_padding_offset_data =
|
||||
output_padding_offset_cpu->data<int>();
|
||||
switch (tmp_out_cpu.dtype()) {
|
||||
case paddle::DataType::FLOAT32:
|
||||
RebuildAppendPaddingCPUImpl<float>(out.data<float>(),
|
||||
tmp_out_cpu.data<float>(),
|
||||
cu_seqlens_q_data,
|
||||
seq_len_this_time_data,
|
||||
seq_lens_decoder_data,
|
||||
seq_lens_encoder_data,
|
||||
output_padding_offset_data,
|
||||
max_input_length,
|
||||
dim_embed,
|
||||
elem_nums);
|
||||
break;
|
||||
case paddle::DataType::FLOAT16:
|
||||
RebuildAppendPaddingCPUImpl<paddle::float16>(
|
||||
out.data<paddle::float16>(),
|
||||
tmp_out_cpu.data<paddle::float16>(),
|
||||
cu_seqlens_q_data,
|
||||
seq_len_this_time_data,
|
||||
seq_lens_decoder_data,
|
||||
seq_lens_encoder_data,
|
||||
output_padding_offset_data,
|
||||
max_input_length,
|
||||
dim_embed,
|
||||
elem_nums);
|
||||
break;
|
||||
case paddle::DataType::BFLOAT16:
|
||||
RebuildAppendPaddingCPUImpl<paddle::bfloat16>(
|
||||
out.data<paddle::bfloat16>(),
|
||||
tmp_out_cpu.data<paddle::bfloat16>(),
|
||||
cu_seqlens_q_data,
|
||||
seq_len_this_time_data,
|
||||
seq_lens_decoder_data,
|
||||
seq_lens_encoder_data,
|
||||
output_padding_offset_data,
|
||||
max_input_length,
|
||||
dim_embed,
|
||||
elem_nums);
|
||||
break;
|
||||
default:
|
||||
PD_THROW(
|
||||
"Unsupported data type for rebuild_padding_cpu. "
|
||||
"Only float32, float16, and bfloat16 are supported.");
|
||||
}
|
||||
} else {
|
||||
switch (tmp_out_cpu.dtype()) {
|
||||
case paddle::DataType::FLOAT32:
|
||||
RebuildPaddingCPUImpl<float>(out.data<float>(),
|
||||
tmp_out_cpu.data<float>(),
|
||||
cu_seqlens_q_data,
|
||||
seq_len_this_time_data,
|
||||
seq_lens_decoder_data,
|
||||
seq_lens_encoder_data,
|
||||
max_input_length,
|
||||
dim_embed,
|
||||
elem_nums);
|
||||
break;
|
||||
case paddle::DataType::FLOAT16:
|
||||
RebuildPaddingCPUImpl<paddle::float16>(
|
||||
out.data<paddle::float16>(),
|
||||
tmp_out_cpu.data<paddle::float16>(),
|
||||
cu_seqlens_q_data,
|
||||
seq_len_this_time_data,
|
||||
seq_lens_decoder_data,
|
||||
seq_lens_encoder_data,
|
||||
max_input_length,
|
||||
dim_embed,
|
||||
elem_nums);
|
||||
break;
|
||||
case paddle::DataType::BFLOAT16:
|
||||
RebuildPaddingCPUImpl<paddle::bfloat16>(
|
||||
out.data<paddle::bfloat16>(),
|
||||
tmp_out_cpu.data<paddle::bfloat16>(),
|
||||
cu_seqlens_q_data,
|
||||
seq_len_this_time_data,
|
||||
seq_lens_decoder_data,
|
||||
seq_lens_encoder_data,
|
||||
max_input_length,
|
||||
dim_embed,
|
||||
elem_nums);
|
||||
break;
|
||||
default:
|
||||
PD_THROW(
|
||||
"Unsupported data type for rebuild_padding_cpu. "
|
||||
"Only float32, float16, and bfloat16 are supported.");
|
||||
}
|
||||
} else {
|
||||
switch (tmp_out_cpu.dtype()) {
|
||||
case paddle::DataType::FLOAT32:
|
||||
RebuildPaddingCPUImpl<float>(out.data<float>(),
|
||||
tmp_out_cpu.data<float>(),
|
||||
cu_seqlens_q_data,
|
||||
seq_len_this_time_data,
|
||||
seq_lens_decoder_data,
|
||||
seq_lens_encoder_data,
|
||||
max_input_length,
|
||||
dim_embed,
|
||||
elem_nums);
|
||||
break;
|
||||
case paddle::DataType::FLOAT16:
|
||||
RebuildPaddingCPUImpl<paddle::float16>(
|
||||
out.data<paddle::float16>(),
|
||||
tmp_out_cpu.data<paddle::float16>(),
|
||||
cu_seqlens_q_data,
|
||||
seq_len_this_time_data,
|
||||
seq_lens_decoder_data,
|
||||
seq_lens_encoder_data,
|
||||
max_input_length,
|
||||
dim_embed,
|
||||
elem_nums);
|
||||
break;
|
||||
case paddle::DataType::BFLOAT16:
|
||||
RebuildPaddingCPUImpl<paddle::bfloat16>(
|
||||
out.data<paddle::bfloat16>(),
|
||||
tmp_out_cpu.data<paddle::bfloat16>(),
|
||||
cu_seqlens_q_data,
|
||||
seq_len_this_time_data,
|
||||
seq_lens_decoder_data,
|
||||
seq_lens_encoder_data,
|
||||
max_input_length,
|
||||
dim_embed,
|
||||
elem_nums);
|
||||
break;
|
||||
default:
|
||||
PD_THROW(
|
||||
"Unsupported data type for rebuild_padding_cpu. "
|
||||
"Only float32, float16, and bfloat16 are supported.");
|
||||
}
|
||||
return {out};
|
||||
}
|
||||
return {out};
|
||||
}
|
||||
|
||||
std::vector<std::vector<int64_t>> RebuildPaddingInferShape(
|
||||
@@ -238,13 +234,13 @@ std::vector<std::vector<int64_t>> RebuildPaddingInferShape(
|
||||
const std::vector<int64_t> &seq_lens_decoder_shape,
|
||||
const std::vector<int64_t> &seq_lens_encoder_shape,
|
||||
const paddle::optional<std::vector<int64_t>> &output_padding_offset_shape) {
|
||||
int64_t dim_embed = tmp_out_shape[1];
|
||||
if (output_padding_offset_shape) {
|
||||
return {{-1, dim_embed}};
|
||||
} else {
|
||||
int64_t bsz = cu_seqlens_q_shape[0] - 1;
|
||||
return {{bsz, dim_embed}};
|
||||
}
|
||||
int64_t dim_embed = tmp_out_shape[1];
|
||||
if (output_padding_offset_shape) {
|
||||
return {{-1, dim_embed}};
|
||||
} else {
|
||||
int64_t bsz = cu_seqlens_q_shape[0] - 1;
|
||||
return {{bsz, dim_embed}};
|
||||
}
|
||||
}
|
||||
|
||||
std::vector<paddle::DataType> RebuildPaddingInferDtype(
|
||||
@@ -254,7 +250,7 @@ std::vector<paddle::DataType> RebuildPaddingInferDtype(
|
||||
const paddle::DataType &seq_lens_decoder_dtype,
|
||||
const paddle::DataType &seq_lens_encoder_dtype,
|
||||
const paddle::optional<paddle::DataType> &output_padding_offset_dtype) {
|
||||
return {tmp_out_dtype};
|
||||
return {tmp_out_dtype};
|
||||
}
|
||||
|
||||
PD_BUILD_STATIC_OP(rebuild_padding_cpu)
|
||||
|
||||
@@ -15,27 +15,27 @@
|
||||
#include "paddle/extension.h"
|
||||
|
||||
void set_value_by_flags_and_idx(const bool *stop_flags,
|
||||
int64_t *pre_ids_all,
|
||||
const int64_t *input_ids,
|
||||
const int *seq_lens_encoder,
|
||||
const int *seq_lens_decoder,
|
||||
const int64_t *step_idx,
|
||||
int bs,
|
||||
int length,
|
||||
int length_input_ids) {
|
||||
for (int bi = 0; bi < bs; bi++) {
|
||||
if (!stop_flags[bi]) {
|
||||
const int seq_len_dec = seq_lens_decoder[bi];
|
||||
const int seq_len_enc = seq_lens_encoder[bi];
|
||||
int64_t *pre_ids_all_now = pre_ids_all + bi * length;
|
||||
const int64_t *input_ids_now = input_ids + bi * length_input_ids;
|
||||
if (seq_len_dec == 0) {
|
||||
pre_ids_all_now[step_idx[bi]] = input_ids_now[seq_len_enc - 1];
|
||||
} else {
|
||||
pre_ids_all_now[step_idx[bi]] = input_ids_now[0];
|
||||
}
|
||||
}
|
||||
int64_t *pre_ids_all,
|
||||
const int64_t *input_ids,
|
||||
const int *seq_lens_encoder,
|
||||
const int *seq_lens_decoder,
|
||||
const int64_t *step_idx,
|
||||
int bs,
|
||||
int length,
|
||||
int length_input_ids) {
|
||||
for (int bi = 0; bi < bs; bi++) {
|
||||
if (!stop_flags[bi]) {
|
||||
const int seq_len_dec = seq_lens_decoder[bi];
|
||||
const int seq_len_enc = seq_lens_encoder[bi];
|
||||
int64_t *pre_ids_all_now = pre_ids_all + bi * length;
|
||||
const int64_t *input_ids_now = input_ids + bi * length_input_ids;
|
||||
if (seq_len_dec == 0) {
|
||||
pre_ids_all_now[step_idx[bi]] = input_ids_now[seq_len_enc - 1];
|
||||
} else {
|
||||
pre_ids_all_now[step_idx[bi]] = input_ids_now[0];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void SetValueByFlagsAndIdx(const paddle::Tensor &pre_ids_all,
|
||||
@@ -45,12 +45,12 @@ void SetValueByFlagsAndIdx(const paddle::Tensor &pre_ids_all,
|
||||
const paddle::Tensor &seq_lens_decoder,
|
||||
const paddle::Tensor &step_idx,
|
||||
const paddle::Tensor &stop_flags) {
|
||||
std::vector<int64_t> pre_ids_all_shape = pre_ids_all.shape();
|
||||
int bs = seq_lens_this_time.shape()[0];
|
||||
int length = pre_ids_all_shape[1];
|
||||
int length_input_ids = input_ids.shape()[1];
|
||||
std::vector<int64_t> pre_ids_all_shape = pre_ids_all.shape();
|
||||
int bs = seq_lens_this_time.shape()[0];
|
||||
int length = pre_ids_all_shape[1];
|
||||
int length_input_ids = input_ids.shape()[1];
|
||||
|
||||
set_value_by_flags_and_idx(stop_flags.data<bool>(),
|
||||
set_value_by_flags_and_idx(stop_flags.data<bool>(),
|
||||
const_cast<int64_t *>(pre_ids_all.data<int64_t>()),
|
||||
input_ids.data<int64_t>(),
|
||||
seq_lens_encoder.data<int>(),
|
||||
|
||||
@@ -21,45 +21,45 @@ void probs_sort(const float *probs,
|
||||
float *ProbsVals,
|
||||
int vocab_size,
|
||||
int bsz) {
|
||||
float cursum = 0;
|
||||
std::vector<int64_t> elementsIds(vocab_size);
|
||||
std::vector<float> elementsProbs(vocab_size);
|
||||
float cursum = 0;
|
||||
std::vector<int64_t> elementsIds(vocab_size);
|
||||
std::vector<float> elementsProbs(vocab_size);
|
||||
#pragma omp parallel for
|
||||
for (int j = 0; j < vocab_size; j++) {
|
||||
elementsIds[j] = j;
|
||||
elementsProbs[j] = probs[j];
|
||||
}
|
||||
x86simdsortStatic::keyvalue_qsort(
|
||||
elementsProbs.data(), elementsIds.data(), vocab_size, false, true);
|
||||
for (int j = 0; j < vocab_size; j++) {
|
||||
elementsIds[j] = j;
|
||||
elementsProbs[j] = probs[j];
|
||||
}
|
||||
x86simdsortStatic::keyvalue_qsort(
|
||||
elementsProbs.data(), elementsIds.data(), vocab_size, false, true);
|
||||
#pragma omp parallel for
|
||||
for (int j = 0; j < vocab_size; ++j) {
|
||||
ProbsVals[j] = elementsProbs[j];
|
||||
ProbsIds[j] = elementsIds[j];
|
||||
}
|
||||
for (int j = 0; j < vocab_size; ++j) {
|
||||
ProbsVals[j] = elementsProbs[j];
|
||||
ProbsIds[j] = elementsIds[j];
|
||||
}
|
||||
}
|
||||
std::vector<paddle::Tensor> SimdSort(const paddle::Tensor &probs) {
|
||||
const int bsz = probs.shape()[0];
|
||||
const int vocab_size = probs.shape()[1];
|
||||
auto sorted_indices = paddle::empty(
|
||||
{bsz, vocab_size}, paddle::DataType::INT64, probs.place());
|
||||
auto sorted_probs = paddle::empty(
|
||||
{bsz, vocab_size}, paddle::DataType::FLOAT32, probs.place());
|
||||
probs_sort(probs.data<float>(),
|
||||
const_cast<int64_t *>(sorted_indices.data<int64_t>()),
|
||||
const_cast<float *>(sorted_probs.data<float>()),
|
||||
vocab_size,
|
||||
bsz);
|
||||
return {sorted_indices, sorted_probs};
|
||||
const int bsz = probs.shape()[0];
|
||||
const int vocab_size = probs.shape()[1];
|
||||
auto sorted_indices =
|
||||
paddle::empty({bsz, vocab_size}, paddle::DataType::INT64, probs.place());
|
||||
auto sorted_probs = paddle::empty(
|
||||
{bsz, vocab_size}, paddle::DataType::FLOAT32, probs.place());
|
||||
probs_sort(probs.data<float>(),
|
||||
const_cast<int64_t *>(sorted_indices.data<int64_t>()),
|
||||
const_cast<float *>(sorted_probs.data<float>()),
|
||||
vocab_size,
|
||||
bsz);
|
||||
return {sorted_indices, sorted_probs};
|
||||
}
|
||||
std::vector<std::vector<int64_t>> SimdSortInferShape(
|
||||
const std::vector<int64_t> &probs_shape) {
|
||||
int64_t bsz = probs_shape[0];
|
||||
int64_t vocab_size = probs_shape[1];
|
||||
return {{bsz, vocab_size}, {bsz, vocab_size}};
|
||||
int64_t bsz = probs_shape[0];
|
||||
int64_t vocab_size = probs_shape[1];
|
||||
return {{bsz, vocab_size}, {bsz, vocab_size}};
|
||||
}
|
||||
std::vector<paddle::DataType> SimdSortInferDtype(
|
||||
const paddle::DataType &probs_dtype) {
|
||||
return {paddle::DataType::INT64, paddle::DataType::FLOAT32};
|
||||
return {paddle::DataType::INT64, paddle::DataType::FLOAT32};
|
||||
}
|
||||
PD_BUILD_STATIC_OP(simd_sort)
|
||||
.Inputs({"probs"})
|
||||
|
||||
@@ -16,23 +16,23 @@
|
||||
#include "paddle/extension.h"
|
||||
|
||||
std::vector<paddle::Tensor> SimdSort(const paddle::Tensor &probs) {
|
||||
const int bsz = probs.shape()[0];
|
||||
const int vocab_size = probs.shape()[1];
|
||||
auto sorted_indices = paddle::empty(
|
||||
{bsz, vocab_size}, paddle::DataType::INT64, probs.place());
|
||||
auto sorted_probs = paddle::empty(
|
||||
{bsz, vocab_size}, paddle::DataType::FLOAT32, probs.place());
|
||||
return {sorted_indices, sorted_probs};
|
||||
const int bsz = probs.shape()[0];
|
||||
const int vocab_size = probs.shape()[1];
|
||||
auto sorted_indices =
|
||||
paddle::empty({bsz, vocab_size}, paddle::DataType::INT64, probs.place());
|
||||
auto sorted_probs = paddle::empty(
|
||||
{bsz, vocab_size}, paddle::DataType::FLOAT32, probs.place());
|
||||
return {sorted_indices, sorted_probs};
|
||||
}
|
||||
std::vector<std::vector<int64_t>> SimdSortInferShape(
|
||||
const std::vector<int64_t> &probs_shape) {
|
||||
int64_t bsz = probs_shape[0];
|
||||
int64_t vocab_size = probs_shape[1];
|
||||
return {{bsz, vocab_size}, {bsz, vocab_size}};
|
||||
int64_t bsz = probs_shape[0];
|
||||
int64_t vocab_size = probs_shape[1];
|
||||
return {{bsz, vocab_size}, {bsz, vocab_size}};
|
||||
}
|
||||
std::vector<paddle::DataType> SimdSortInferDtype(
|
||||
const paddle::DataType &probs_dtype) {
|
||||
return {paddle::DataType::INT64, paddle::DataType::FLOAT32};
|
||||
return {paddle::DataType::INT64, paddle::DataType::FLOAT32};
|
||||
}
|
||||
PD_BUILD_STATIC_OP(simd_sort)
|
||||
.Inputs({"probs"})
|
||||
|
||||
@@ -23,13 +23,13 @@
|
||||
#endif
|
||||
|
||||
bool is_in_end(const int64_t id, const int64_t *end_ids, int length) {
|
||||
bool flag = false;
|
||||
for (int i = 0; i < length; i++) {
|
||||
if (id == end_ids[i]) {
|
||||
return true;
|
||||
}
|
||||
bool flag = false;
|
||||
for (int i = 0; i < length; i++) {
|
||||
if (id == end_ids[i]) {
|
||||
return true;
|
||||
}
|
||||
return flag;
|
||||
}
|
||||
return flag;
|
||||
}
|
||||
|
||||
void set_value_by_flags(bool *stop_flags,
|
||||
@@ -40,23 +40,23 @@ void set_value_by_flags(bool *stop_flags,
|
||||
const int bs,
|
||||
const int end_length,
|
||||
bool beam_search) {
|
||||
for (int bi = 0; bi < bs; bi++) {
|
||||
if (stop_flags[bi]) {
|
||||
if ((seq_lens[bi] == 0)) {
|
||||
topk_ids[bi] = -1;
|
||||
} else {
|
||||
topk_ids[bi] = end_ids[0];
|
||||
next_tokens[bi] = end_ids[0];
|
||||
}
|
||||
} else {
|
||||
next_tokens[bi] = topk_ids[bi];
|
||||
}
|
||||
if (!beam_search && is_in_end(topk_ids[bi], end_ids, end_length)) {
|
||||
stop_flags[bi] = true;
|
||||
topk_ids[bi] = end_ids[0];
|
||||
next_tokens[bi] = end_ids[0];
|
||||
}
|
||||
for (int bi = 0; bi < bs; bi++) {
|
||||
if (stop_flags[bi]) {
|
||||
if ((seq_lens[bi] == 0)) {
|
||||
topk_ids[bi] = -1;
|
||||
} else {
|
||||
topk_ids[bi] = end_ids[0];
|
||||
next_tokens[bi] = end_ids[0];
|
||||
}
|
||||
} else {
|
||||
next_tokens[bi] = topk_ids[bi];
|
||||
}
|
||||
if (!beam_search && is_in_end(topk_ids[bi], end_ids, end_length)) {
|
||||
stop_flags[bi] = true;
|
||||
topk_ids[bi] = end_ids[0];
|
||||
next_tokens[bi] = end_ids[0];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void GetStopFlagsMulti(const paddle::Tensor &topk_ids,
|
||||
@@ -65,17 +65,17 @@ void GetStopFlagsMulti(const paddle::Tensor &topk_ids,
|
||||
const paddle::Tensor &end_ids,
|
||||
const paddle::Tensor &next_tokens,
|
||||
const bool beam_search) {
|
||||
std::vector<int64_t> shape = topk_ids.shape();
|
||||
int64_t bs_now = shape[0];
|
||||
int64_t end_length = end_ids.shape()[0];
|
||||
set_value_by_flags(const_cast<bool *>(stop_flags.data<bool>()),
|
||||
const_cast<int64_t *>(topk_ids.data<int64_t>()),
|
||||
const_cast<int64_t *>(next_tokens.data<int64_t>()),
|
||||
end_ids.data<int64_t>(),
|
||||
seq_lens.data<int>(),
|
||||
bs_now,
|
||||
end_length,
|
||||
false);
|
||||
std::vector<int64_t> shape = topk_ids.shape();
|
||||
int64_t bs_now = shape[0];
|
||||
int64_t end_length = end_ids.shape()[0];
|
||||
set_value_by_flags(const_cast<bool *>(stop_flags.data<bool>()),
|
||||
const_cast<int64_t *>(topk_ids.data<int64_t>()),
|
||||
const_cast<int64_t *>(next_tokens.data<int64_t>()),
|
||||
end_ids.data<int64_t>(),
|
||||
seq_lens.data<int>(),
|
||||
bs_now,
|
||||
end_length,
|
||||
false);
|
||||
}
|
||||
|
||||
PD_BUILD_STATIC_OP(set_stop_value_multi_ends_cpu)
|
||||
|
||||
@@ -23,16 +23,16 @@ void min_length_logits_process(float *logits,
|
||||
const int64_t bs,
|
||||
const int64_t length,
|
||||
const int64_t end_length) {
|
||||
for (int bi = 0; bi < bs; ++bi) {
|
||||
if (cur_len[bi] < 0) {
|
||||
continue;
|
||||
}
|
||||
if (cur_len[bi] < min_len[bi]) {
|
||||
for (int i = 0; i < end_length; ++i) {
|
||||
logits[bi * length + eos_token_id[i]] = -1e10;
|
||||
}
|
||||
}
|
||||
for (int bi = 0; bi < bs; ++bi) {
|
||||
if (cur_len[bi] < 0) {
|
||||
continue;
|
||||
}
|
||||
if (cur_len[bi] < min_len[bi]) {
|
||||
for (int i = 0; i < end_length; ++i) {
|
||||
logits[bi * length + eos_token_id[i]] = -1e10;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void update_repeat_times(const int64_t *pre_ids,
|
||||
@@ -41,20 +41,20 @@ void update_repeat_times(const int64_t *pre_ids,
|
||||
const int64_t bs,
|
||||
const int64_t length,
|
||||
const int64_t length_id) {
|
||||
for (int bi = 0; bi < bs; ++bi) {
|
||||
if (cur_len[bi] < 0) {
|
||||
continue;
|
||||
}
|
||||
const int64_t *pre_ids_now = pre_ids + bi * length_id;
|
||||
int *repeat_times_now = repeat_times + bi * length;
|
||||
for (int i = 0; i < length_id; i++) {
|
||||
int64_t id = pre_ids_now[i];
|
||||
if (id < 0) {
|
||||
break;
|
||||
}
|
||||
repeat_times_now[id] += 1;
|
||||
}
|
||||
for (int bi = 0; bi < bs; ++bi) {
|
||||
if (cur_len[bi] < 0) {
|
||||
continue;
|
||||
}
|
||||
const int64_t *pre_ids_now = pre_ids + bi * length_id;
|
||||
int *repeat_times_now = repeat_times + bi * length;
|
||||
for (int i = 0; i < length_id; i++) {
|
||||
int64_t id = pre_ids_now[i];
|
||||
if (id < 0) {
|
||||
break;
|
||||
}
|
||||
repeat_times_now[id] += 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void update_value_by_repeat_times(const int *repeat_times,
|
||||
@@ -65,24 +65,22 @@ void update_value_by_repeat_times(const int *repeat_times,
|
||||
float *logits,
|
||||
const int64_t bs,
|
||||
const int64_t length) {
|
||||
for (int bi = 0; bi < bs; ++bi) {
|
||||
float *logits_now = logits + bi * length;
|
||||
const int *repeat_times_now = repeat_times + bi * length;
|
||||
float alpha = static_cast<float>(penalty_scores[bi]);
|
||||
float beta = static_cast<float>(frequency_score[bi]);
|
||||
float gamma = static_cast<float>(presence_score[bi]);
|
||||
for (int i = 0; i < length; ++i) {
|
||||
int times = repeat_times_now[i];
|
||||
float logit_now = static_cast<float>(logits_now[i]);
|
||||
if (times == 0) {
|
||||
logits_now[i] =
|
||||
static_cast<float>(logit_now / temperatures[bi]);
|
||||
}
|
||||
logit_now = logit_now < 0 ? logit_now * alpha : logit_now / alpha;
|
||||
logits_now[i] =
|
||||
static_cast<float>(logit_now - times * beta - gamma);
|
||||
}
|
||||
for (int bi = 0; bi < bs; ++bi) {
|
||||
float *logits_now = logits + bi * length;
|
||||
const int *repeat_times_now = repeat_times + bi * length;
|
||||
float alpha = static_cast<float>(penalty_scores[bi]);
|
||||
float beta = static_cast<float>(frequency_score[bi]);
|
||||
float gamma = static_cast<float>(presence_score[bi]);
|
||||
for (int i = 0; i < length; ++i) {
|
||||
int times = repeat_times_now[i];
|
||||
float logit_now = static_cast<float>(logits_now[i]);
|
||||
if (times == 0) {
|
||||
logits_now[i] = static_cast<float>(logit_now / temperatures[bi]);
|
||||
}
|
||||
logit_now = logit_now < 0 ? logit_now * alpha : logit_now / alpha;
|
||||
logits_now[i] = static_cast<float>(logit_now - times * beta - gamma);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void ban_bad_words(float *logits,
|
||||
@@ -90,15 +88,14 @@ void ban_bad_words(float *logits,
|
||||
const int64_t bs,
|
||||
const int64_t length,
|
||||
const int64_t bad_words_length) {
|
||||
for (int bi = 0; bi < bs; ++bi) {
|
||||
float *logits_now = logits + bi * length;
|
||||
for (int bwid = 0; bwid < bad_words_length; ++bwid) {
|
||||
const int64_t bad_words_token_id = bad_words_list[bwid];
|
||||
if (bad_words_token_id >= length || bad_words_token_id < 0)
|
||||
continue;
|
||||
logits_now[bad_words_token_id] = -1e10;
|
||||
}
|
||||
for (int bi = 0; bi < bs; ++bi) {
|
||||
float *logits_now = logits + bi * length;
|
||||
for (int bwid = 0; bwid < bad_words_length; ++bwid) {
|
||||
const int64_t bad_words_token_id = bad_words_list[bwid];
|
||||
if (bad_words_token_id >= length || bad_words_token_id < 0) continue;
|
||||
logits_now[bad_words_token_id] = -1e10;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <paddle::DataType D>
|
||||
@@ -112,44 +109,44 @@ void token_penalty_multi_scores_kernel(const paddle::Tensor &pre_ids,
|
||||
const paddle::Tensor &cur_len,
|
||||
const paddle::Tensor &min_len,
|
||||
const paddle::Tensor &eos_token_id) {
|
||||
std::vector<int64_t> shape = logits.shape();
|
||||
auto repeat_times =
|
||||
paddle::full(shape, 0, paddle::DataType::INT32, pre_ids.place());
|
||||
int64_t bs = shape[0];
|
||||
int64_t length = shape[1];
|
||||
int64_t length_id = pre_ids.shape()[1];
|
||||
int64_t end_length = eos_token_id.shape()[0];
|
||||
int64_t length_bad_words = bad_tokens.shape()[0];
|
||||
std::vector<int64_t> shape = logits.shape();
|
||||
auto repeat_times =
|
||||
paddle::full(shape, 0, paddle::DataType::INT32, pre_ids.place());
|
||||
int64_t bs = shape[0];
|
||||
int64_t length = shape[1];
|
||||
int64_t length_id = pre_ids.shape()[1];
|
||||
int64_t end_length = eos_token_id.shape()[0];
|
||||
int64_t length_bad_words = bad_tokens.shape()[0];
|
||||
|
||||
min_length_logits_process(const_cast<float *>(logits.data<float>()),
|
||||
cur_len.data<int64_t>(),
|
||||
min_len.data<int64_t>(),
|
||||
eos_token_id.data<int64_t>(),
|
||||
bs,
|
||||
length,
|
||||
end_length);
|
||||
min_length_logits_process(const_cast<float *>(logits.data<float>()),
|
||||
cur_len.data<int64_t>(),
|
||||
min_len.data<int64_t>(),
|
||||
eos_token_id.data<int64_t>(),
|
||||
bs,
|
||||
length,
|
||||
end_length);
|
||||
|
||||
update_repeat_times(pre_ids.data<int64_t>(),
|
||||
cur_len.data<int64_t>(),
|
||||
repeat_times.data<int>(),
|
||||
bs,
|
||||
length,
|
||||
length_id);
|
||||
update_repeat_times(pre_ids.data<int64_t>(),
|
||||
cur_len.data<int64_t>(),
|
||||
repeat_times.data<int>(),
|
||||
bs,
|
||||
length,
|
||||
length_id);
|
||||
|
||||
update_value_by_repeat_times(repeat_times.data<int>(),
|
||||
penalty_scores.data<float>(),
|
||||
frequency_score.data<float>(),
|
||||
presence_score.data<float>(),
|
||||
temperatures.data<float>(),
|
||||
const_cast<float *>(logits.data<float>()),
|
||||
bs,
|
||||
length);
|
||||
update_value_by_repeat_times(repeat_times.data<int>(),
|
||||
penalty_scores.data<float>(),
|
||||
frequency_score.data<float>(),
|
||||
presence_score.data<float>(),
|
||||
temperatures.data<float>(),
|
||||
const_cast<float *>(logits.data<float>()),
|
||||
bs,
|
||||
length);
|
||||
|
||||
ban_bad_words(const_cast<float *>(logits.data<float>()),
|
||||
bad_tokens.data<int64_t>(),
|
||||
bs,
|
||||
length,
|
||||
length_bad_words);
|
||||
ban_bad_words(const_cast<float *>(logits.data<float>()),
|
||||
bad_tokens.data<int64_t>(),
|
||||
bs,
|
||||
length,
|
||||
length_bad_words);
|
||||
}
|
||||
|
||||
void TokenPenaltyMultiScores(const paddle::Tensor &pre_ids,
|
||||
@@ -162,17 +159,17 @@ void TokenPenaltyMultiScores(const paddle::Tensor &pre_ids,
|
||||
const paddle::Tensor &cur_len,
|
||||
const paddle::Tensor &min_len,
|
||||
const paddle::Tensor &eos_token_id) {
|
||||
return token_penalty_multi_scores_kernel<paddle::DataType::FLOAT32>(
|
||||
pre_ids,
|
||||
logits,
|
||||
penalty_scores,
|
||||
frequency_scores,
|
||||
presence_scores,
|
||||
temperatures,
|
||||
bad_tokens,
|
||||
cur_len,
|
||||
min_len,
|
||||
eos_token_id);
|
||||
return token_penalty_multi_scores_kernel<paddle::DataType::FLOAT32>(
|
||||
pre_ids,
|
||||
logits,
|
||||
penalty_scores,
|
||||
frequency_scores,
|
||||
presence_scores,
|
||||
temperatures,
|
||||
bad_tokens,
|
||||
cur_len,
|
||||
min_len,
|
||||
eos_token_id);
|
||||
}
|
||||
|
||||
PD_BUILD_STATIC_OP(get_token_penalty_multi_scores_cpu)
|
||||
|
||||
@@ -24,50 +24,50 @@ void update_inputs_kernel(bool *not_need_stop,
|
||||
const int64_t *next_tokens,
|
||||
const int bsz,
|
||||
const int input_ids_stride) {
|
||||
int64_t stop_sum = 0;
|
||||
for (int bi = 0; bi < bsz; ++bi) {
|
||||
bool stop_flag_now = false;
|
||||
int64_t stop_flag_now_int = 0;
|
||||
stop_flag_now = stop_flags[bi];
|
||||
stop_flag_now_int = static_cast<int64_t>(stop_flag_now);
|
||||
auto seq_len_this_time = seq_lens_this_time[bi];
|
||||
auto seq_len_encoder = seq_lens_encoder[bi];
|
||||
auto seq_len_decoder = seq_lens_decoder[bi];
|
||||
seq_lens_decoder[bi] =
|
||||
stop_flag_now ? 0
|
||||
: (seq_len_decoder == 0 ? seq_len_encoder
|
||||
: seq_len_decoder + 1);
|
||||
seq_lens_this_time[bi] = stop_flag_now ? 0 : 1;
|
||||
seq_lens_encoder[bi] = 0;
|
||||
int64_t *input_ids_now = input_ids + bi * input_ids_stride;
|
||||
input_ids_now[0] = next_tokens[bi];
|
||||
stop_sum += stop_flag_now_int;
|
||||
}
|
||||
not_need_stop[0] = stop_sum < stop_nums[0];
|
||||
int64_t stop_sum = 0;
|
||||
for (int bi = 0; bi < bsz; ++bi) {
|
||||
bool stop_flag_now = false;
|
||||
int64_t stop_flag_now_int = 0;
|
||||
stop_flag_now = stop_flags[bi];
|
||||
stop_flag_now_int = static_cast<int64_t>(stop_flag_now);
|
||||
auto seq_len_this_time = seq_lens_this_time[bi];
|
||||
auto seq_len_encoder = seq_lens_encoder[bi];
|
||||
auto seq_len_decoder = seq_lens_decoder[bi];
|
||||
seq_lens_decoder[bi] =
|
||||
stop_flag_now
|
||||
? 0
|
||||
: (seq_len_decoder == 0 ? seq_len_encoder : seq_len_decoder + 1);
|
||||
seq_lens_this_time[bi] = stop_flag_now ? 0 : 1;
|
||||
seq_lens_encoder[bi] = 0;
|
||||
int64_t *input_ids_now = input_ids + bi * input_ids_stride;
|
||||
input_ids_now[0] = next_tokens[bi];
|
||||
stop_sum += stop_flag_now_int;
|
||||
}
|
||||
not_need_stop[0] = stop_sum < stop_nums[0];
|
||||
}
|
||||
|
||||
void UpdateInputs(const paddle::Tensor &stop_flags,
|
||||
const paddle::Tensor &not_need_stop,
|
||||
const paddle::Tensor &seq_lens_this_time,
|
||||
const paddle::Tensor &seq_lens_encoder,
|
||||
const paddle::Tensor &seq_lens_decoder,
|
||||
const paddle::Tensor &input_ids,
|
||||
const paddle::Tensor &stop_nums,
|
||||
const paddle::Tensor &next_tokens,
|
||||
const paddle::Tensor &is_block_step) {
|
||||
const int bsz = input_ids.shape()[0];
|
||||
const int input_ids_stride = input_ids.shape()[1];
|
||||
update_inputs_kernel(const_cast<bool *>(not_need_stop.data<bool>()),
|
||||
const_cast<int *>(seq_lens_this_time.data<int>()),
|
||||
const_cast<int *>(seq_lens_encoder.data<int>()),
|
||||
const_cast<int *>(seq_lens_decoder.data<int>()),
|
||||
const_cast<int64_t *>(input_ids.data<int64_t>()),
|
||||
stop_nums.data<int64_t>(),
|
||||
stop_flags.data<bool>(),
|
||||
is_block_step.data<bool>(),
|
||||
next_tokens.data<int64_t>(),
|
||||
bsz,
|
||||
input_ids_stride);
|
||||
const paddle::Tensor &not_need_stop,
|
||||
const paddle::Tensor &seq_lens_this_time,
|
||||
const paddle::Tensor &seq_lens_encoder,
|
||||
const paddle::Tensor &seq_lens_decoder,
|
||||
const paddle::Tensor &input_ids,
|
||||
const paddle::Tensor &stop_nums,
|
||||
const paddle::Tensor &next_tokens,
|
||||
const paddle::Tensor &is_block_step) {
|
||||
const int bsz = input_ids.shape()[0];
|
||||
const int input_ids_stride = input_ids.shape()[1];
|
||||
update_inputs_kernel(const_cast<bool *>(not_need_stop.data<bool>()),
|
||||
const_cast<int *>(seq_lens_this_time.data<int>()),
|
||||
const_cast<int *>(seq_lens_encoder.data<int>()),
|
||||
const_cast<int *>(seq_lens_decoder.data<int>()),
|
||||
const_cast<int64_t *>(input_ids.data<int64_t>()),
|
||||
stop_nums.data<int64_t>(),
|
||||
stop_flags.data<bool>(),
|
||||
is_block_step.data<bool>(),
|
||||
next_tokens.data<int64_t>(),
|
||||
bsz,
|
||||
input_ids_stride);
|
||||
}
|
||||
|
||||
PD_BUILD_STATIC_OP(update_inputs_cpu)
|
||||
|
||||
@@ -45,18 +45,18 @@ std::vector<paddle::Tensor> InvokeAllLLaMALayer(
|
||||
int maxPositions,
|
||||
int maxPosEmbed,
|
||||
int intermediateSize) {
|
||||
auto out = paddle::empty_like(input);
|
||||
return {out};
|
||||
auto out = paddle::empty_like(input);
|
||||
return {out};
|
||||
}
|
||||
|
||||
std::vector<std::vector<int64_t>> AllLLaMALayerInferShape(
|
||||
std::vector<int64_t> x_shape) {
|
||||
return {x_shape};
|
||||
return {x_shape};
|
||||
}
|
||||
|
||||
std::vector<paddle::DataType> AllLLaMALayerInferDtype(
|
||||
paddle::DataType x_dtype) {
|
||||
return {x_dtype};
|
||||
return {x_dtype};
|
||||
}
|
||||
|
||||
PD_BUILD_STATIC_OP(xft_llama_all_layer)
|
||||
|
||||
@@ -16,20 +16,20 @@
|
||||
#include "paddle/extension.h"
|
||||
|
||||
std::vector<paddle::Tensor> XftGreedySearch(const paddle::Tensor &probs) {
|
||||
const int bsz = probs.shape()[0];
|
||||
const int vocab_size = probs.shape()[1];
|
||||
auto next_tokens =
|
||||
paddle::empty({bsz, 1}, paddle::DataType::INT64, probs.place());
|
||||
return {next_tokens};
|
||||
const int bsz = probs.shape()[0];
|
||||
const int vocab_size = probs.shape()[1];
|
||||
auto next_tokens =
|
||||
paddle::empty({bsz, 1}, paddle::DataType::INT64, probs.place());
|
||||
return {next_tokens};
|
||||
}
|
||||
std::vector<std::vector<int64_t>> XftGreedySearchInferShape(
|
||||
const std::vector<int64_t> &probs_shape) {
|
||||
int64_t bsz = probs_shape[0];
|
||||
return {{bsz, 1}};
|
||||
int64_t bsz = probs_shape[0];
|
||||
return {{bsz, 1}};
|
||||
}
|
||||
std::vector<paddle::DataType> XftGreedySearchInferDtype(
|
||||
const paddle::DataType &probs_dtype) {
|
||||
return {paddle::DataType::INT64};
|
||||
return {paddle::DataType::INT64};
|
||||
}
|
||||
PD_BUILD_STATIC_OP(xft_greedy_search)
|
||||
.Inputs({"probs"})
|
||||
|
||||
Reference in New Issue
Block a user