fix typos (#3684)

2025-12-24 13:28:13 +08:00 · 2025-09-01 17:50:17 +08:00
parent 0513a78ecc
commit d6369b4d51
67 changed files with 85 additions and 85 deletions
--- a/custom_ops/gpu_ops/cpp_extensions.cc
+++ b/custom_ops/gpu_ops/cpp_extensions.cc
@@ -980,7 +980,7 @@ PYBIND11_MODULE(fastdeploy_ops, m) {

  m.def("per_token_quant_padding", &PerTokenQuantPadding, py::arg("input"),
        py::arg("block_size"),
-        "per token per block quant and padding tranpose scale");
+        "per token per block quant and padding transpose scale");

  m.def("masked_per_token_quant", &MaskedPerTokenQuant, py::arg("input"),
        py::arg("recv_expert_count"), py::arg("block_size"),
--- a/custom_ops/gpu_ops/cutlass_extensions/gemm/threadblock/wint2x_mma_base.h
+++ b/custom_ops/gpu_ops/cutlass_extensions/gemm/threadblock/wint2x_mma_base.h
@@ -89,11 +89,11 @@ public:
      GemmShape<Shape::kM / WarpGemm::kM, Shape::kN / WarpGemm::kN,
                Shape::kK / WarpGemm::kK>;

-  /// Number of warp-level GEMM oeprations
+  /// Number of warp-level GEMM operations
  static int const kWarpGemmIterations =
      (WarpGemm::kK / Operator::Policy::MmaShape::kK);

-  /// Number of warp-level GEMM oeprations per load for B
+  /// Number of warp-level GEMM operations per load for B
  static constexpr int kWarpGemmIterationsPerLoadForB =
      Operator::IteratorB::InstructionShape::kRow / Operator::InstructionShape::kK;
  static_assert(!(kWarpGemmIterations % kWarpGemmIterationsPerLoadForB), "");
--- a/custom_ops/gpu_ops/cutlass_kernels/fp8_gemm_fused/dual_gemm/thread/left_gelu_and_mul.h
+++ b/custom_ops/gpu_ops/cutlass_kernels/fp8_gemm_fused/dual_gemm/thread/left_gelu_and_mul.h
@@ -117,7 +117,7 @@ class LeftGELUAndMul {
    CUTLASS_HOST_DEVICE
    FragmentOutput operator()(FragmentAccumulator const &lhs,
                              FragmentAccumulator const &rhs) const {
-        // Convert source to interal compute numeric type
+        // Convert source to internal compute numeric type
        NumericArrayConverter<ElementCompute, ElementAccumulator, kCount, Round>
            accumulator_to_compute;

--- a/custom_ops/gpu_ops/cutlass_kernels/fp8_gemm_fused/dual_gemm/thread/left_silu_and_mul.h
+++ b/custom_ops/gpu_ops/cutlass_kernels/fp8_gemm_fused/dual_gemm/thread/left_silu_and_mul.h
@@ -117,7 +117,7 @@ class LeftSiLUAndMul {
    CUTLASS_HOST_DEVICE
    FragmentOutput operator()(FragmentAccumulator const &lhs,
                              FragmentAccumulator const &rhs) const {
-        // Convert source to interal compute numeric type
+        // Convert source to internal compute numeric type
        NumericArrayConverter<ElementCompute, ElementAccumulator, kCount, Round>
            accumulator_to_compute;

--- a/custom_ops/gpu_ops/cutlass_kernels/fp8_gemm_fused/dual_gemm/threadblock/dual_mma_base.h
+++ b/custom_ops/gpu_ops/cutlass_kernels/fp8_gemm_fused/dual_gemm/threadblock/dual_mma_base.h
@@ -92,7 +92,7 @@ class DualMmaBase {
                              Shape::kN / WarpGemm::kN,
                              Shape::kK / WarpGemm::kK>;

-  /// Number of warp-level GEMM oeprations
+  /// Number of warp-level GEMM operations
  static int const kWarpGemmIterations =
      (WarpGemm::kK / Operator0::Policy::MmaShape::kK);

--- a/custom_ops/gpu_ops/cutlass_kernels/w4a8_moe/cutlass_extensions/epilogue/threadblock/epilogue_per_row_per_col_scale_nf4.h
+++ b/custom_ops/gpu_ops/cutlass_kernels/w4a8_moe/cutlass_extensions/epilogue/threadblock/epilogue_per_row_per_col_scale_nf4.h
@@ -219,7 +219,7 @@ class EpilogueVisitorPerRowPerColNf4 {
      iterator_C_.clear_mask();
    }
    // NOTE(wangbojun) Currently, this kernel don't hanve implantention for
-    // adding elementwise beta, we keep this here for future useage beta_ =
+    // adding elementwise beta, we keep this here for future usage beta_ =
    // (params.elementwise.beta_ptr ? *params.elementwise.beta_ptr :
    // params.elementwise.beta); if (beta_ == ElementAccumulator()) {
    //     iterator_C_.clear_mask();
--- a/custom_ops/gpu_ops/cutlass_kernels/w4a8_moe/cutlass_extensions/epilogue/threadblock/epilogue_tensor_op_int32.h
+++ b/custom_ops/gpu_ops/cutlass_kernels/w4a8_moe/cutlass_extensions/epilogue/threadblock/epilogue_tensor_op_int32.h
@@ -176,7 +176,7 @@ struct Nf4DefaultIteratorsTensorOp<cutlass::bfloat16_t,
 ///
 /// Satisfies: ReadableTileIterator
 ///
-template <typename ThreadMap_  ///< Thread map (conept: OutputTileThreadMap)
+template <typename ThreadMap_  ///< Thread map (concept: OutputTileThreadMap)
          >
 class SharedLoadIteratorMixed<ThreadMap_, int32_t, 32, 16, 8, 8> {
 public:
--- a/custom_ops/gpu_ops/cutlass_kernels/w4a8_moe/cutlass_extensions/gemm/threadblock/default_mma_nf4_int8_interleaved.h
+++ b/custom_ops/gpu_ops/cutlass_kernels/w4a8_moe/cutlass_extensions/gemm/threadblock/default_mma_nf4_int8_interleaved.h
@@ -64,7 +64,7 @@ template <
    typename InstructionShape_,
    /// Number of stages used in the pipelined mainloop
    int Stages,
-    /// Operation perfomed by GEMM
+    /// Operation performed by GEMM
    typename Operator,
    /// Store the accumulators in row major or column major.  Row major is used
    /// when output layout is interleaved.
--- a/custom_ops/gpu_ops/cutlass_kernels/w4a8_moe/cutlass_extensions/gemm/threadblock/int8_mma_base.h
+++ b/custom_ops/gpu_ops/cutlass_kernels/w4a8_moe/cutlass_extensions/gemm/threadblock/int8_mma_base.h
@@ -133,7 +133,7 @@ public:
    /// Shape describing the number of warps filling the CTA
    using WarpCount = GemmShape<Shape::kM / WarpGemm::kM, Shape::kN / WarpGemm::kN, Shape::kK / WarpGemm::kK>;

-    /// Number of warp-level GEMM oeprations
+    /// Number of warp-level GEMM operations
    static int const kWarpGemmIterations = (WarpGemm::kK / Operator::Policy::MmaShape::kK);
    static_assert(Operator::IteratorB::InstructionShape::kRow>=Operator::InstructionShape::kK,"");
    static constexpr int kNumKIterationsPerWarpBLoad =
--- a/custom_ops/gpu_ops/cutlass_kernels/w4a8_moe/cutlass_extensions/gemm/threadblock/int8_mma_multistage.h
+++ b/custom_ops/gpu_ops/cutlass_kernels/w4a8_moe/cutlass_extensions/gemm/threadblock/int8_mma_multistage.h
@@ -509,7 +509,7 @@ public:
                    this->warp_tile_iterator_B_.load(warp_frag_B[(warp_tileB_k_load_offset + 1) % 2]);
                    ++this->warp_tile_iterator_B_;
                }
-                // TOOD(wangbojun) lds_converter can be remove for int8 B input
+                // TODO(wangbojun) lds_converter can be remove for int8 B input
                typename TransformBAfterLDS::result_type converted_frag_B =
                    lds_converter(warp_frag_B[warp_tileB_k_load_offset % 2]);

--- a/custom_ops/gpu_ops/cutlass_kernels/w4a8_moe/cutlass_extensions/gemm/threadblock/nf4_int8_mma_base.h
+++ b/custom_ops/gpu_ops/cutlass_kernels/w4a8_moe/cutlass_extensions/gemm/threadblock/nf4_int8_mma_base.h
@@ -96,7 +96,7 @@ public:
    /// Shape describing the number of warps filling the CTA
    using WarpCount = GemmShape<Shape::kM / WarpGemm::kM, Shape::kN / WarpGemm::kN, Shape::kK / WarpGemm::kK>;

-    /// Number of warp-level GEMM oeprations
+    /// Number of warp-level GEMM operations
    static int const kWarpGemmIterations = (WarpGemm::kK / Operator::Policy::MmaShape::kK);
    static_assert(Operator::IteratorB::InstructionShape::kRow>=Operator::InstructionShape::kK,"");
    static constexpr int kNumKIterationsPerWarpBLoad =
--- a/custom_ops/gpu_ops/cutlass_kernels/w4a8_moe/cutlass_extensions/gemm/threadblock/nf4_int8_mma_multistage.h
+++ b/custom_ops/gpu_ops/cutlass_kernels/w4a8_moe/cutlass_extensions/gemm/threadblock/nf4_int8_mma_multistage.h
@@ -646,7 +646,7 @@ public:
                    //             );
                    // }
                }
-                // TOOD(wangbojun) lds_converter can be remove for int8 B input
+                // TODO(wangbojun) lds_converter can be remove for int8 B input
                // int4
                // typename TransformBAfterLDS::result_type converted_frag_B =
                //     lds_converter(warp_frag_B[warp_tileB_k_load_offset % 2]);
--- a/custom_ops/gpu_ops/int8_gemm_with_cutlass/epilogue_tensor_op_int32.h
+++ b/custom_ops/gpu_ops/int8_gemm_with_cutlass/epilogue_tensor_op_int32.h
@@ -171,7 +171,7 @@ struct DefaultIteratorsTensorOp<cutlass::bfloat16_t,
 ///
 /// Satisfies: ReadableTileIterator
 ///
-template <typename ThreadMap_  ///< Thread map (conept: OutputTileThreadMap)
+template <typename ThreadMap_  ///< Thread map (concept: OutputTileThreadMap)
          >
 class SharedLoadIteratorMixed<ThreadMap_, int32_t, 32, 16, 8, 8> {
    public:
--- a/custom_ops/gpu_ops/moe/moe_dispatch.cu
+++ b/custom_ops/gpu_ops/moe/moe_dispatch.cu
@@ -80,7 +80,7 @@ void MoeDispatchKernel(
  if (group_moe) {
    paddle::Tensor softmax_max_prob_tensor =
        GetEmptyTensor({num_rows, moe_topk}, paddle::DataType::FLOAT32, place);
-    // (TODO: check fill sucess ?)
+    // (TODO: check fill success ?)
    paddle::experimental::fill(softmax_max_prob_tensor, 0.f);
    softmax_max_prob = softmax_max_prob_tensor.data<float>();
  }
--- a/custom_ops/gpu_ops/save_output_msg_with_topk.cc
+++ b/custom_ops/gpu_ops/save_output_msg_with_topk.cc
@@ -75,7 +75,7 @@ void SaveOutMmsgTopK(const paddle::Tensor& x,
        std::string inference_msg_id_env_str(inference_msg_id_env_p);
        inference_msg_id_from_env = std::stoi(inference_msg_id_env_str);
        if (inference_msg_id_from_env == 2) {
-            // 2 and -2 is perserve for no-output indication.
+            // 2 and -2 is preserve for no-output indication.
            throw std::runtime_error(
                " INFERENCE_MSG_ID cannot be 2, please use other number.");
        }
--- a/custom_ops/gpu_ops/save_with_output_msg.cc
+++ b/custom_ops/gpu_ops/save_with_output_msg.cc
@@ -45,7 +45,7 @@ void save_kernel(const paddle::Tensor& x,
        std::string inference_msg_id_env_str(inference_msg_id_env_p);
        inference_msg_id_from_env = std::stoi(inference_msg_id_env_str);
        if (inference_msg_id_from_env == 2) {
-            // 2 and -2 is perserve for no-output indication.
+            // 2 and -2 is preserve for no-output indication.
            throw std::runtime_error(
                " INFERENCE_MSG_ID cannot be 2, please use other number.");
        }
--- a/custom_ops/gpu_ops/set_value_by_flags.cu
+++ b/custom_ops/gpu_ops/set_value_by_flags.cu
@@ -34,7 +34,7 @@ __global__ void set_value_by_flag_and_id(const bool *stop_flags,
        const int64_t *input_ids_now = input_ids + tid * length_input_ids;
        const int seq_len_dec = seq_lens_decoder[tid];
        const int seq_len_enc = seq_lens_encoder[tid];
-        if (seq_len_dec == 0 && seq_len_enc == 0) return;  // stoped
+        if (seq_len_dec == 0 && seq_len_enc == 0) return;  // stopped
        if (step_idx[tid] >= 0) {
            if (seq_len_enc > 0) { // encoder, get last token accord to seq_lens_encoder
                pre_ids_all_now[step_idx[tid]] = input_ids_now[seq_len_enc - 1];
--- a/custom_ops/gpu_ops/speculate_decoding/draft_model/eagle_get_base_model_hidden_states.cu
+++ b/custom_ops/gpu_ops/speculate_decoding/draft_model/eagle_get_base_model_hidden_states.cu
@@ -63,7 +63,7 @@ __global__ void ComputeOrderKernel(
          position_map[in_offset++] = out_offset++;
        }
        in_offset += cur_base_model_seq_lens_this_time - accept_num;
-// (liuzichang): Temperary Reserved for debug
+// (liuzichang): Temporary Reserved for debug
 //         if (accept_num <= actual_draft_token_num) /*Accept partial draft tokens*/ {
 // #ifdef DEBUG_EAGLE_KERNEL
 //         printf("batch %d: accept_num <= actual_draft_token_num \n", i);
--- a/custom_ops/gpu_ops/speculate_decoding/speculate_set_value_by_flags.cu
+++ b/custom_ops/gpu_ops/speculate_decoding/speculate_set_value_by_flags.cu
@@ -35,7 +35,7 @@ __global__ void speculate_set_value_by_flag_and_id(int64_t *pre_ids_all,
            accept_tokens + tid * max_draft_tokens;
        const int seq_len_dec = seq_lens_decoder[tid];
        const int seq_len_enc = seq_lens_encoder[tid];
-        if (seq_len_dec == 0 && seq_len_enc == 0) return;  // stoped
+        if (seq_len_dec == 0 && seq_len_enc == 0) return;  // stopped
        // printf("step_idx[tid] %d\n", step_idx[tid]);
        if (step_idx[tid] >= 0) {
            for (int i = 0; i < accept_num[tid]; i++) {
--- a/custom_ops/gpu_ops/speculate_decoding/speculate_step_reschedule.cu
+++ b/custom_ops/gpu_ops/speculate_decoding/speculate_step_reschedule.cu
@@ -295,7 +295,7 @@ void SpeculateStepSchedule(const paddle::Tensor &stop_flags,
            std::string inference_msg_id_env_str(inference_msg_id_env_p);
            inference_msg_id_from_env = std::stoi(inference_msg_id_env_str);
            if (inference_msg_id_from_env == 2) {
-                // 2 and -2 is perserve for no-output indication.
+                // 2 and -2 is preserve for no-output indication.
                throw std::runtime_error(
                    " INFERENCE_MSG_ID cannot be 2, please use other number.");
            }
--- a/custom_ops/gpu_ops/step_reschedule.cu
+++ b/custom_ops/gpu_ops/step_reschedule.cu
@@ -283,7 +283,7 @@ void Schedule(const paddle::Tensor &stop_flags,
            std::string inference_msg_id_env_str(inference_msg_id_env_p);
            inference_msg_id_from_env = std::stoi(inference_msg_id_env_str);
            if (inference_msg_id_from_env == 2) {
-                // 2 and -2 is perserve for no-output indication.
+                // 2 and -2 is preserve for no-output indication.
                throw std::runtime_error(
                    " INFERENCE_MSG_ID cannot be 2, please use other number.");
            }
--- a/custom_ops/gpu_ops/token_transfer.hpp
+++ b/custom_ops/gpu_ops/token_transfer.hpp
@@ -58,7 +58,7 @@ class TokenTransfer {
    }

    // once copy: cpu --> cpu
-    // arrary length should be (1 + MAX_BATCH)
+    // array length should be (1 + MAX_BATCH)
    bool GetBatchToken(int64_t *array) {
        if (Empty()) {
            return false;
--- a/custom_ops/gpu_ops/update_split_fuse_input.cu
+++ b/custom_ops/gpu_ops/update_split_fuse_input.cu
@@ -75,10 +75,10 @@ void UpdateSplitFuseInputes(const paddle::Tensor& split_fuse_seq_lens,
                            const int max_seq_len,
                            const int max_batch_size,
                            const int split_fuse_size) {
-    dim3 girds;
-    girds.x = max_batch_size;
+    dim3 grids;
+    grids.x = max_batch_size;
    const int block_size = 128;
-    update_split_fuse_inputs_kernel<<<girds,
+    update_split_fuse_inputs_kernel<<<grids,
                                      block_size,
                                      0,
                                      input_ids.stream()>>>(