mirror of
https://github.com/PaddlePaddle/FastDeploy.git
synced 2025-12-24 13:28:13 +08:00
fix typos (#3684)
This commit is contained in:
@@ -980,7 +980,7 @@ PYBIND11_MODULE(fastdeploy_ops, m) {
|
||||
|
||||
m.def("per_token_quant_padding", &PerTokenQuantPadding, py::arg("input"),
|
||||
py::arg("block_size"),
|
||||
"per token per block quant and padding tranpose scale");
|
||||
"per token per block quant and padding transpose scale");
|
||||
|
||||
m.def("masked_per_token_quant", &MaskedPerTokenQuant, py::arg("input"),
|
||||
py::arg("recv_expert_count"), py::arg("block_size"),
|
||||
|
||||
@@ -89,11 +89,11 @@ public:
|
||||
GemmShape<Shape::kM / WarpGemm::kM, Shape::kN / WarpGemm::kN,
|
||||
Shape::kK / WarpGemm::kK>;
|
||||
|
||||
/// Number of warp-level GEMM oeprations
|
||||
/// Number of warp-level GEMM operations
|
||||
static int const kWarpGemmIterations =
|
||||
(WarpGemm::kK / Operator::Policy::MmaShape::kK);
|
||||
|
||||
/// Number of warp-level GEMM oeprations per load for B
|
||||
/// Number of warp-level GEMM operations per load for B
|
||||
static constexpr int kWarpGemmIterationsPerLoadForB =
|
||||
Operator::IteratorB::InstructionShape::kRow / Operator::InstructionShape::kK;
|
||||
static_assert(!(kWarpGemmIterations % kWarpGemmIterationsPerLoadForB), "");
|
||||
|
||||
@@ -117,7 +117,7 @@ class LeftGELUAndMul {
|
||||
CUTLASS_HOST_DEVICE
|
||||
FragmentOutput operator()(FragmentAccumulator const &lhs,
|
||||
FragmentAccumulator const &rhs) const {
|
||||
// Convert source to interal compute numeric type
|
||||
// Convert source to internal compute numeric type
|
||||
NumericArrayConverter<ElementCompute, ElementAccumulator, kCount, Round>
|
||||
accumulator_to_compute;
|
||||
|
||||
|
||||
@@ -117,7 +117,7 @@ class LeftSiLUAndMul {
|
||||
CUTLASS_HOST_DEVICE
|
||||
FragmentOutput operator()(FragmentAccumulator const &lhs,
|
||||
FragmentAccumulator const &rhs) const {
|
||||
// Convert source to interal compute numeric type
|
||||
// Convert source to internal compute numeric type
|
||||
NumericArrayConverter<ElementCompute, ElementAccumulator, kCount, Round>
|
||||
accumulator_to_compute;
|
||||
|
||||
|
||||
@@ -92,7 +92,7 @@ class DualMmaBase {
|
||||
Shape::kN / WarpGemm::kN,
|
||||
Shape::kK / WarpGemm::kK>;
|
||||
|
||||
/// Number of warp-level GEMM oeprations
|
||||
/// Number of warp-level GEMM operations
|
||||
static int const kWarpGemmIterations =
|
||||
(WarpGemm::kK / Operator0::Policy::MmaShape::kK);
|
||||
|
||||
|
||||
@@ -219,7 +219,7 @@ class EpilogueVisitorPerRowPerColNf4 {
|
||||
iterator_C_.clear_mask();
|
||||
}
|
||||
// NOTE(wangbojun) Currently, this kernel don't hanve implantention for
|
||||
// adding elementwise beta, we keep this here for future useage beta_ =
|
||||
// adding elementwise beta, we keep this here for future usage beta_ =
|
||||
// (params.elementwise.beta_ptr ? *params.elementwise.beta_ptr :
|
||||
// params.elementwise.beta); if (beta_ == ElementAccumulator()) {
|
||||
// iterator_C_.clear_mask();
|
||||
|
||||
@@ -176,7 +176,7 @@ struct Nf4DefaultIteratorsTensorOp<cutlass::bfloat16_t,
|
||||
///
|
||||
/// Satisfies: ReadableTileIterator
|
||||
///
|
||||
template <typename ThreadMap_ ///< Thread map (conept: OutputTileThreadMap)
|
||||
template <typename ThreadMap_ ///< Thread map (concept: OutputTileThreadMap)
|
||||
>
|
||||
class SharedLoadIteratorMixed<ThreadMap_, int32_t, 32, 16, 8, 8> {
|
||||
public:
|
||||
|
||||
@@ -64,7 +64,7 @@ template <
|
||||
typename InstructionShape_,
|
||||
/// Number of stages used in the pipelined mainloop
|
||||
int Stages,
|
||||
/// Operation perfomed by GEMM
|
||||
/// Operation performed by GEMM
|
||||
typename Operator,
|
||||
/// Store the accumulators in row major or column major. Row major is used
|
||||
/// when output layout is interleaved.
|
||||
|
||||
@@ -133,7 +133,7 @@ public:
|
||||
/// Shape describing the number of warps filling the CTA
|
||||
using WarpCount = GemmShape<Shape::kM / WarpGemm::kM, Shape::kN / WarpGemm::kN, Shape::kK / WarpGemm::kK>;
|
||||
|
||||
/// Number of warp-level GEMM oeprations
|
||||
/// Number of warp-level GEMM operations
|
||||
static int const kWarpGemmIterations = (WarpGemm::kK / Operator::Policy::MmaShape::kK);
|
||||
static_assert(Operator::IteratorB::InstructionShape::kRow>=Operator::InstructionShape::kK,"");
|
||||
static constexpr int kNumKIterationsPerWarpBLoad =
|
||||
|
||||
@@ -509,7 +509,7 @@ public:
|
||||
this->warp_tile_iterator_B_.load(warp_frag_B[(warp_tileB_k_load_offset + 1) % 2]);
|
||||
++this->warp_tile_iterator_B_;
|
||||
}
|
||||
// TOOD(wangbojun) lds_converter can be remove for int8 B input
|
||||
// TODO(wangbojun) lds_converter can be remove for int8 B input
|
||||
typename TransformBAfterLDS::result_type converted_frag_B =
|
||||
lds_converter(warp_frag_B[warp_tileB_k_load_offset % 2]);
|
||||
|
||||
|
||||
@@ -96,7 +96,7 @@ public:
|
||||
/// Shape describing the number of warps filling the CTA
|
||||
using WarpCount = GemmShape<Shape::kM / WarpGemm::kM, Shape::kN / WarpGemm::kN, Shape::kK / WarpGemm::kK>;
|
||||
|
||||
/// Number of warp-level GEMM oeprations
|
||||
/// Number of warp-level GEMM operations
|
||||
static int const kWarpGemmIterations = (WarpGemm::kK / Operator::Policy::MmaShape::kK);
|
||||
static_assert(Operator::IteratorB::InstructionShape::kRow>=Operator::InstructionShape::kK,"");
|
||||
static constexpr int kNumKIterationsPerWarpBLoad =
|
||||
|
||||
@@ -646,7 +646,7 @@ public:
|
||||
// );
|
||||
// }
|
||||
}
|
||||
// TOOD(wangbojun) lds_converter can be remove for int8 B input
|
||||
// TODO(wangbojun) lds_converter can be remove for int8 B input
|
||||
// int4
|
||||
// typename TransformBAfterLDS::result_type converted_frag_B =
|
||||
// lds_converter(warp_frag_B[warp_tileB_k_load_offset % 2]);
|
||||
|
||||
@@ -171,7 +171,7 @@ struct DefaultIteratorsTensorOp<cutlass::bfloat16_t,
|
||||
///
|
||||
/// Satisfies: ReadableTileIterator
|
||||
///
|
||||
template <typename ThreadMap_ ///< Thread map (conept: OutputTileThreadMap)
|
||||
template <typename ThreadMap_ ///< Thread map (concept: OutputTileThreadMap)
|
||||
>
|
||||
class SharedLoadIteratorMixed<ThreadMap_, int32_t, 32, 16, 8, 8> {
|
||||
public:
|
||||
|
||||
@@ -80,7 +80,7 @@ void MoeDispatchKernel(
|
||||
if (group_moe) {
|
||||
paddle::Tensor softmax_max_prob_tensor =
|
||||
GetEmptyTensor({num_rows, moe_topk}, paddle::DataType::FLOAT32, place);
|
||||
// (TODO: check fill sucess ?)
|
||||
// (TODO: check fill success ?)
|
||||
paddle::experimental::fill(softmax_max_prob_tensor, 0.f);
|
||||
softmax_max_prob = softmax_max_prob_tensor.data<float>();
|
||||
}
|
||||
|
||||
@@ -75,7 +75,7 @@ void SaveOutMmsgTopK(const paddle::Tensor& x,
|
||||
std::string inference_msg_id_env_str(inference_msg_id_env_p);
|
||||
inference_msg_id_from_env = std::stoi(inference_msg_id_env_str);
|
||||
if (inference_msg_id_from_env == 2) {
|
||||
// 2 and -2 is perserve for no-output indication.
|
||||
// 2 and -2 is preserve for no-output indication.
|
||||
throw std::runtime_error(
|
||||
" INFERENCE_MSG_ID cannot be 2, please use other number.");
|
||||
}
|
||||
|
||||
@@ -45,7 +45,7 @@ void save_kernel(const paddle::Tensor& x,
|
||||
std::string inference_msg_id_env_str(inference_msg_id_env_p);
|
||||
inference_msg_id_from_env = std::stoi(inference_msg_id_env_str);
|
||||
if (inference_msg_id_from_env == 2) {
|
||||
// 2 and -2 is perserve for no-output indication.
|
||||
// 2 and -2 is preserve for no-output indication.
|
||||
throw std::runtime_error(
|
||||
" INFERENCE_MSG_ID cannot be 2, please use other number.");
|
||||
}
|
||||
|
||||
@@ -34,7 +34,7 @@ __global__ void set_value_by_flag_and_id(const bool *stop_flags,
|
||||
const int64_t *input_ids_now = input_ids + tid * length_input_ids;
|
||||
const int seq_len_dec = seq_lens_decoder[tid];
|
||||
const int seq_len_enc = seq_lens_encoder[tid];
|
||||
if (seq_len_dec == 0 && seq_len_enc == 0) return; // stoped
|
||||
if (seq_len_dec == 0 && seq_len_enc == 0) return; // stopped
|
||||
if (step_idx[tid] >= 0) {
|
||||
if (seq_len_enc > 0) { // encoder, get last token accord to seq_lens_encoder
|
||||
pre_ids_all_now[step_idx[tid]] = input_ids_now[seq_len_enc - 1];
|
||||
|
||||
@@ -63,7 +63,7 @@ __global__ void ComputeOrderKernel(
|
||||
position_map[in_offset++] = out_offset++;
|
||||
}
|
||||
in_offset += cur_base_model_seq_lens_this_time - accept_num;
|
||||
// (liuzichang): Temperary Reserved for debug
|
||||
// (liuzichang): Temporary Reserved for debug
|
||||
// if (accept_num <= actual_draft_token_num) /*Accept partial draft tokens*/ {
|
||||
// #ifdef DEBUG_EAGLE_KERNEL
|
||||
// printf("batch %d: accept_num <= actual_draft_token_num \n", i);
|
||||
|
||||
@@ -35,7 +35,7 @@ __global__ void speculate_set_value_by_flag_and_id(int64_t *pre_ids_all,
|
||||
accept_tokens + tid * max_draft_tokens;
|
||||
const int seq_len_dec = seq_lens_decoder[tid];
|
||||
const int seq_len_enc = seq_lens_encoder[tid];
|
||||
if (seq_len_dec == 0 && seq_len_enc == 0) return; // stoped
|
||||
if (seq_len_dec == 0 && seq_len_enc == 0) return; // stopped
|
||||
// printf("step_idx[tid] %d\n", step_idx[tid]);
|
||||
if (step_idx[tid] >= 0) {
|
||||
for (int i = 0; i < accept_num[tid]; i++) {
|
||||
|
||||
@@ -295,7 +295,7 @@ void SpeculateStepSchedule(const paddle::Tensor &stop_flags,
|
||||
std::string inference_msg_id_env_str(inference_msg_id_env_p);
|
||||
inference_msg_id_from_env = std::stoi(inference_msg_id_env_str);
|
||||
if (inference_msg_id_from_env == 2) {
|
||||
// 2 and -2 is perserve for no-output indication.
|
||||
// 2 and -2 is preserve for no-output indication.
|
||||
throw std::runtime_error(
|
||||
" INFERENCE_MSG_ID cannot be 2, please use other number.");
|
||||
}
|
||||
|
||||
@@ -283,7 +283,7 @@ void Schedule(const paddle::Tensor &stop_flags,
|
||||
std::string inference_msg_id_env_str(inference_msg_id_env_p);
|
||||
inference_msg_id_from_env = std::stoi(inference_msg_id_env_str);
|
||||
if (inference_msg_id_from_env == 2) {
|
||||
// 2 and -2 is perserve for no-output indication.
|
||||
// 2 and -2 is preserve for no-output indication.
|
||||
throw std::runtime_error(
|
||||
" INFERENCE_MSG_ID cannot be 2, please use other number.");
|
||||
}
|
||||
|
||||
@@ -58,7 +58,7 @@ class TokenTransfer {
|
||||
}
|
||||
|
||||
// once copy: cpu --> cpu
|
||||
// arrary length should be (1 + MAX_BATCH)
|
||||
// array length should be (1 + MAX_BATCH)
|
||||
bool GetBatchToken(int64_t *array) {
|
||||
if (Empty()) {
|
||||
return false;
|
||||
|
||||
@@ -75,10 +75,10 @@ void UpdateSplitFuseInputes(const paddle::Tensor& split_fuse_seq_lens,
|
||||
const int max_seq_len,
|
||||
const int max_batch_size,
|
||||
const int split_fuse_size) {
|
||||
dim3 girds;
|
||||
girds.x = max_batch_size;
|
||||
dim3 grids;
|
||||
grids.x = max_batch_size;
|
||||
const int block_size = 128;
|
||||
update_split_fuse_inputs_kernel<<<girds,
|
||||
update_split_fuse_inputs_kernel<<<grids,
|
||||
block_size,
|
||||
0,
|
||||
input_ids.stream()>>>(
|
||||
|
||||
Reference in New Issue
Block a user