mirror of
https://github.com/PaddlePaddle/FastDeploy.git
synced 2025-10-04 08:16:42 +08:00
polish code with new pre-commit rule (#2923)
This commit is contained in:
@@ -19,7 +19,7 @@
|
||||
// #define DEBUG_EAGLE_KERNEL
|
||||
|
||||
__global__ void ComputeOrderKernel(
|
||||
const int* seq_lens_this_time,
|
||||
const int* seq_lens_this_time,
|
||||
const int* seq_lens_encoder,
|
||||
const int* base_model_seq_lens_this_time,
|
||||
const int* base_model_seq_lens_encoder,
|
||||
@@ -47,7 +47,7 @@ __global__ void ComputeOrderKernel(
|
||||
printf("batch %d: cur_seq_lens_encoder > 0 \n", i);
|
||||
#endif
|
||||
for (int j = 0; j < cur_seq_lens_encoder; j++) {
|
||||
position_map[in_offset++] = out_offset++;
|
||||
position_map[in_offset++] = out_offset++;
|
||||
}
|
||||
// 2. base model encoder. Base step=0
|
||||
} else if (cur_base_model_seq_lens_encoder != 0) {
|
||||
@@ -69,13 +69,13 @@ __global__ void ComputeOrderKernel(
|
||||
in_offset += cur_base_model_seq_lens_this_time;
|
||||
} else /*Accept all draft tokens*/ {
|
||||
#ifdef DEBUG_EAGLE_KERNEL
|
||||
printf("batch %d: accept_num > actual_draft_token_num \n", i);
|
||||
printf("batch %d: accept_num > actual_draft_token_num \n", i);
|
||||
#endif
|
||||
position_map[in_offset + accept_num - 2] = out_offset++;
|
||||
position_map[in_offset + accept_num - 1] = out_offset++;
|
||||
in_offset += cur_base_model_seq_lens_this_time;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
output_token_num[0] = out_offset;
|
||||
#ifdef DEBUG_EAGLE_KERNEL
|
||||
@@ -208,7 +208,7 @@ std::vector<paddle::Tensor> EagleGetHiddenStates(
|
||||
}
|
||||
case paddle::DataType::BFLOAT16: {
|
||||
return DispatchDtype<paddle::DataType::BFLOAT16>(
|
||||
input,
|
||||
input,
|
||||
seq_lens_this_time,
|
||||
seq_lens_encoder,
|
||||
seq_lens_decoder,
|
||||
|
@@ -72,7 +72,7 @@ __global__ void computeOrderKernel(
|
||||
output_token_num[0] = out_offset;
|
||||
#ifdef DEBUG_EAGLE_KERNEL
|
||||
printf("position map output_token_num%d:\n", output_token_num[0]);
|
||||
for (int i = 0; i < output_token_num[0]; i++) {
|
||||
for (int i = 0; i < output_token_num[0]; i++) {
|
||||
printf("%d ", src_map[i]);
|
||||
}
|
||||
printf("\n");
|
||||
@@ -187,4 +187,4 @@ PD_BUILD_STATIC_OP(eagle_get_self_hidden_states)
|
||||
"seq_lens_this_time",
|
||||
"step_idx"})
|
||||
.Outputs({"out"})
|
||||
.SetKernelFn(PD_KERNEL(EagleGetSelfHiddenStates));
|
||||
.SetKernelFn(PD_KERNEL(EagleGetSelfHiddenStates));
|
||||
|
@@ -26,7 +26,7 @@ __global__ void RebuildAppendPaddingKernel(
|
||||
const int seq_len,
|
||||
const int dim_embed,
|
||||
const size_t elem_nums) {
|
||||
using LoadT = AlignedVector<T, VecSize>;
|
||||
using LoadT = AlignedVector<T, VecSize>;
|
||||
LoadT src_vec;
|
||||
const int64_t global_idx = blockDim.x * blockIdx.x + threadIdx.x;
|
||||
for (int64_t i = global_idx * VecSize; i < elem_nums; i += gridDim.x * blockDim.x * VecSize) {
|
||||
@@ -42,7 +42,7 @@ __global__ void RebuildAppendPaddingKernel(
|
||||
|
||||
const int input_token_id = ori_token_id - cum_offset[bi] + seq_id;
|
||||
const int bias_idx = i % dim_embed;
|
||||
|
||||
|
||||
Load<T, VecSize>(&full_hidden_states[input_token_id * dim_embed + bias_idx], &src_vec);
|
||||
Store<T, VecSize>(src_vec, &out[i]);
|
||||
}
|
||||
@@ -78,14 +78,14 @@ std::vector<paddle::Tensor> DispatchDtype(
|
||||
GetNumBlocks(pack_num, &grid_size);
|
||||
|
||||
RebuildAppendPaddingKernel<DataType_, PackSize><<<grid_size, threads_per_block, 0, full_hidden_states.stream()>>>(
|
||||
reinterpret_cast<DataType_*>(out.data<data_t>()),
|
||||
reinterpret_cast<const DataType_*>(full_hidden_states.data<data_t>()),
|
||||
cum_offsets.data<int32_t>(),
|
||||
seq_len_encoder.data<int32_t>(),
|
||||
seq_len_decoder.data<int32_t>(),
|
||||
output_padding_offset.data<int32_t>(),
|
||||
max_seq_len,
|
||||
dim_embed,
|
||||
reinterpret_cast<DataType_*>(out.data<data_t>()),
|
||||
reinterpret_cast<const DataType_*>(full_hidden_states.data<data_t>()),
|
||||
cum_offsets.data<int32_t>(),
|
||||
seq_len_encoder.data<int32_t>(),
|
||||
seq_len_decoder.data<int32_t>(),
|
||||
output_padding_offset.data<int32_t>(),
|
||||
max_seq_len,
|
||||
dim_embed,
|
||||
elem_nums);
|
||||
return {out};
|
||||
}
|
||||
@@ -99,7 +99,7 @@ std::vector<paddle::Tensor> RebuildAppendPadding(
|
||||
const paddle::Tensor& output_padding_offset,
|
||||
const int max_seq_len) {
|
||||
|
||||
|
||||
|
||||
switch (full_hidden_states.dtype()) {
|
||||
case paddle::DataType::BFLOAT16:
|
||||
return DispatchDtype<paddle::DataType::BFLOAT16>(
|
||||
@@ -137,7 +137,7 @@ std::vector<paddle::DataType> RebuildAppendPaddingInferDtype(
|
||||
|
||||
|
||||
PD_BUILD_STATIC_OP(speculate_rebuild_append_padding)
|
||||
.Inputs({"full_hidden_states",
|
||||
.Inputs({"full_hidden_states",
|
||||
"cum_offsets",
|
||||
"seq_len_encoder",
|
||||
"seq_len_decoder",
|
||||
@@ -146,4 +146,4 @@ PD_BUILD_STATIC_OP(speculate_rebuild_append_padding)
|
||||
.Outputs({"out"})
|
||||
.SetKernelFn(PD_KERNEL(RebuildAppendPadding))
|
||||
.SetInferShapeFn(PD_INFER_SHAPE(RebuildAppendPaddingInferShape))
|
||||
.SetInferDtypeFn(PD_INFER_DTYPE(RebuildAppendPaddingInferDtype));
|
||||
.SetInferDtypeFn(PD_INFER_DTYPE(RebuildAppendPaddingInferDtype));
|
||||
|
@@ -93,7 +93,7 @@ __global__ void speculate_free_and_reschedule(bool *stop_flags,
|
||||
used_list_len[tid] = 0;
|
||||
}
|
||||
} else if (seq_lens_this_time[tid] != 0 && max_possible_block_idx < block_num_per_seq &&
|
||||
block_table_now[(seq_lens_decoder[tid] + max_draft_tokens +
|
||||
block_table_now[(seq_lens_decoder[tid] + max_draft_tokens +
|
||||
1) /
|
||||
block_size] == -1) {
|
||||
// 统计需要分配block的位置和总数
|
||||
@@ -347,7 +347,7 @@ PD_BUILD_STATIC_OP(speculate_step_reschedule)
|
||||
"next_tokens",
|
||||
"first_token_ids",
|
||||
"accept_num"})
|
||||
.Attrs({"block_size: int",
|
||||
.Attrs({"block_size: int",
|
||||
"encoder_decoder_block_num: int",
|
||||
"max_draft_tokens: int"})
|
||||
.Outputs({"stop_flags_out",
|
||||
|
Reference in New Issue
Block a user