mirror of https://github.com/PaddlePaddle/FastDeploy.git (synced 2025-10-04 16:22:57 +08:00)
fix typos (#3684)
@@ -980,7 +980,7 @@ PYBIND11_MODULE(fastdeploy_ops, m) {

 m.def("per_token_quant_padding", &PerTokenQuantPadding, py::arg("input"),
 py::arg("block_size"),
-"per token per block quant and padding tranpose scale");
+"per token per block quant and padding transpose scale");

 m.def("masked_per_token_quant", &MaskedPerTokenQuant, py::arg("input"),
 py::arg("recv_expert_count"), py::arg("block_size"),
@@ -89,11 +89,11 @@ public:
 GemmShape<Shape::kM / WarpGemm::kM, Shape::kN / WarpGemm::kN,
 Shape::kK / WarpGemm::kK>;

-/// Number of warp-level GEMM oeprations
+/// Number of warp-level GEMM operations
 static int const kWarpGemmIterations =
 (WarpGemm::kK / Operator::Policy::MmaShape::kK);

-/// Number of warp-level GEMM oeprations per load for B
+/// Number of warp-level GEMM operations per load for B
 static constexpr int kWarpGemmIterationsPerLoadForB =
 Operator::IteratorB::InstructionShape::kRow / Operator::InstructionShape::kK;
 static_assert(!(kWarpGemmIterations % kWarpGemmIterationsPerLoadForB), "");
@@ -117,7 +117,7 @@ class LeftGELUAndMul {
 CUTLASS_HOST_DEVICE
 FragmentOutput operator()(FragmentAccumulator const &lhs,
 FragmentAccumulator const &rhs) const {
-// Convert source to interal compute numeric type
+// Convert source to internal compute numeric type
 NumericArrayConverter<ElementCompute, ElementAccumulator, kCount, Round>
 accumulator_to_compute;

@@ -117,7 +117,7 @@ class LeftSiLUAndMul {
 CUTLASS_HOST_DEVICE
 FragmentOutput operator()(FragmentAccumulator const &lhs,
 FragmentAccumulator const &rhs) const {
-// Convert source to interal compute numeric type
+// Convert source to internal compute numeric type
 NumericArrayConverter<ElementCompute, ElementAccumulator, kCount, Round>
 accumulator_to_compute;

@@ -92,7 +92,7 @@ class DualMmaBase {
 Shape::kN / WarpGemm::kN,
 Shape::kK / WarpGemm::kK>;

-/// Number of warp-level GEMM oeprations
+/// Number of warp-level GEMM operations
 static int const kWarpGemmIterations =
 (WarpGemm::kK / Operator0::Policy::MmaShape::kK);

@@ -219,7 +219,7 @@ class EpilogueVisitorPerRowPerColNf4 {
 iterator_C_.clear_mask();
 }
 // NOTE(wangbojun) Currently, this kernel don't hanve implantention for
-// adding elementwise beta, we keep this here for future useage beta_ =
+// adding elementwise beta, we keep this here for future usage beta_ =
 // (params.elementwise.beta_ptr ? *params.elementwise.beta_ptr :
 // params.elementwise.beta); if (beta_ == ElementAccumulator()) {
 // iterator_C_.clear_mask();
@@ -176,7 +176,7 @@ struct Nf4DefaultIteratorsTensorOp<cutlass::bfloat16_t,
 ///
 /// Satisfies: ReadableTileIterator
 ///
-template <typename ThreadMap_ ///< Thread map (conept: OutputTileThreadMap)
+template <typename ThreadMap_ ///< Thread map (concept: OutputTileThreadMap)
 >
 class SharedLoadIteratorMixed<ThreadMap_, int32_t, 32, 16, 8, 8> {
 public:
@@ -64,7 +64,7 @@ template <
 typename InstructionShape_,
 /// Number of stages used in the pipelined mainloop
 int Stages,
-/// Operation perfomed by GEMM
+/// Operation performed by GEMM
 typename Operator,
 /// Store the accumulators in row major or column major. Row major is used
 /// when output layout is interleaved.
@@ -133,7 +133,7 @@ public:
 /// Shape describing the number of warps filling the CTA
 using WarpCount = GemmShape<Shape::kM / WarpGemm::kM, Shape::kN / WarpGemm::kN, Shape::kK / WarpGemm::kK>;

-/// Number of warp-level GEMM oeprations
+/// Number of warp-level GEMM operations
 static int const kWarpGemmIterations = (WarpGemm::kK / Operator::Policy::MmaShape::kK);
 static_assert(Operator::IteratorB::InstructionShape::kRow>=Operator::InstructionShape::kK,"");
 static constexpr int kNumKIterationsPerWarpBLoad =
@@ -509,7 +509,7 @@ public:
 this->warp_tile_iterator_B_.load(warp_frag_B[(warp_tileB_k_load_offset + 1) % 2]);
 ++this->warp_tile_iterator_B_;
 }
-// TOOD(wangbojun) lds_converter can be remove for int8 B input
+// TODO(wangbojun) lds_converter can be remove for int8 B input
 typename TransformBAfterLDS::result_type converted_frag_B =
 lds_converter(warp_frag_B[warp_tileB_k_load_offset % 2]);

@@ -96,7 +96,7 @@ public:
 /// Shape describing the number of warps filling the CTA
 using WarpCount = GemmShape<Shape::kM / WarpGemm::kM, Shape::kN / WarpGemm::kN, Shape::kK / WarpGemm::kK>;

-/// Number of warp-level GEMM oeprations
+/// Number of warp-level GEMM operations
 static int const kWarpGemmIterations = (WarpGemm::kK / Operator::Policy::MmaShape::kK);
 static_assert(Operator::IteratorB::InstructionShape::kRow>=Operator::InstructionShape::kK,"");
 static constexpr int kNumKIterationsPerWarpBLoad =
@@ -646,7 +646,7 @@ public:
 // );
 // }
 }
-// TOOD(wangbojun) lds_converter can be remove for int8 B input
+// TODO(wangbojun) lds_converter can be remove for int8 B input
 // int4
 // typename TransformBAfterLDS::result_type converted_frag_B =
 // lds_converter(warp_frag_B[warp_tileB_k_load_offset % 2]);
@@ -171,7 +171,7 @@ struct DefaultIteratorsTensorOp<cutlass::bfloat16_t,
 ///
 /// Satisfies: ReadableTileIterator
 ///
-template <typename ThreadMap_ ///< Thread map (conept: OutputTileThreadMap)
+template <typename ThreadMap_ ///< Thread map (concept: OutputTileThreadMap)
 >
 class SharedLoadIteratorMixed<ThreadMap_, int32_t, 32, 16, 8, 8> {
 public:
@@ -80,7 +80,7 @@ void MoeDispatchKernel(
 if (group_moe) {
 paddle::Tensor softmax_max_prob_tensor =
 GetEmptyTensor({num_rows, moe_topk}, paddle::DataType::FLOAT32, place);
-// (TODO: check fill sucess ?)
+// (TODO: check fill success ?)
 paddle::experimental::fill(softmax_max_prob_tensor, 0.f);
 softmax_max_prob = softmax_max_prob_tensor.data<float>();
 }
@@ -75,7 +75,7 @@ void SaveOutMmsgTopK(const paddle::Tensor& x,
 std::string inference_msg_id_env_str(inference_msg_id_env_p);
 inference_msg_id_from_env = std::stoi(inference_msg_id_env_str);
 if (inference_msg_id_from_env == 2) {
-// 2 and -2 is perserve for no-output indication.
+// 2 and -2 is preserve for no-output indication.
 throw std::runtime_error(
 " INFERENCE_MSG_ID cannot be 2, please use other number.");
 }
@@ -45,7 +45,7 @@ void save_kernel(const paddle::Tensor& x,
 std::string inference_msg_id_env_str(inference_msg_id_env_p);
 inference_msg_id_from_env = std::stoi(inference_msg_id_env_str);
 if (inference_msg_id_from_env == 2) {
-// 2 and -2 is perserve for no-output indication.
+// 2 and -2 is preserve for no-output indication.
 throw std::runtime_error(
 " INFERENCE_MSG_ID cannot be 2, please use other number.");
 }
@@ -34,7 +34,7 @@ __global__ void set_value_by_flag_and_id(const bool *stop_flags,
 const int64_t *input_ids_now = input_ids + tid * length_input_ids;
 const int seq_len_dec = seq_lens_decoder[tid];
 const int seq_len_enc = seq_lens_encoder[tid];
-if (seq_len_dec == 0 && seq_len_enc == 0) return; // stoped
+if (seq_len_dec == 0 && seq_len_enc == 0) return; // stopped
 if (step_idx[tid] >= 0) {
 if (seq_len_enc > 0) { // encoder, get last token accord to seq_lens_encoder
 pre_ids_all_now[step_idx[tid]] = input_ids_now[seq_len_enc - 1];
@@ -63,7 +63,7 @@ __global__ void ComputeOrderKernel(
 position_map[in_offset++] = out_offset++;
 }
 in_offset += cur_base_model_seq_lens_this_time - accept_num;
-// (liuzichang): Temperary Reserved for debug
+// (liuzichang): Temporary Reserved for debug
 // if (accept_num <= actual_draft_token_num) /*Accept partial draft tokens*/ {
 // #ifdef DEBUG_EAGLE_KERNEL
 // printf("batch %d: accept_num <= actual_draft_token_num \n", i);
@@ -35,7 +35,7 @@ __global__ void speculate_set_value_by_flag_and_id(int64_t *pre_ids_all,
 accept_tokens + tid * max_draft_tokens;
 const int seq_len_dec = seq_lens_decoder[tid];
 const int seq_len_enc = seq_lens_encoder[tid];
-if (seq_len_dec == 0 && seq_len_enc == 0) return; // stoped
+if (seq_len_dec == 0 && seq_len_enc == 0) return; // stopped
 // printf("step_idx[tid] %d\n", step_idx[tid]);
 if (step_idx[tid] >= 0) {
 for (int i = 0; i < accept_num[tid]; i++) {
@@ -295,7 +295,7 @@ void SpeculateStepSchedule(const paddle::Tensor &stop_flags,
 std::string inference_msg_id_env_str(inference_msg_id_env_p);
 inference_msg_id_from_env = std::stoi(inference_msg_id_env_str);
 if (inference_msg_id_from_env == 2) {
-// 2 and -2 is perserve for no-output indication.
+// 2 and -2 is preserve for no-output indication.
 throw std::runtime_error(
 " INFERENCE_MSG_ID cannot be 2, please use other number.");
 }
@@ -283,7 +283,7 @@ void Schedule(const paddle::Tensor &stop_flags,
 std::string inference_msg_id_env_str(inference_msg_id_env_p);
 inference_msg_id_from_env = std::stoi(inference_msg_id_env_str);
 if (inference_msg_id_from_env == 2) {
-// 2 and -2 is perserve for no-output indication.
+// 2 and -2 is preserve for no-output indication.
 throw std::runtime_error(
 " INFERENCE_MSG_ID cannot be 2, please use other number.");
 }
@@ -58,7 +58,7 @@ class TokenTransfer {
 }

 // once copy: cpu --> cpu
-// arrary length should be (1 + MAX_BATCH)
+// array length should be (1 + MAX_BATCH)
 bool GetBatchToken(int64_t *array) {
 if (Empty()) {
 return false;
@@ -75,10 +75,10 @@ void UpdateSplitFuseInputes(const paddle::Tensor& split_fuse_seq_lens,
 const int max_seq_len,
 const int max_batch_size,
 const int split_fuse_size) {
-dim3 girds;
-girds.x = max_batch_size;
+dim3 grids;
+grids.x = max_batch_size;
 const int block_size = 128;
-update_split_fuse_inputs_kernel<<<girds,
+update_split_fuse_inputs_kernel<<<grids,
 block_size,
 0,
 input_ids.stream()>>>(
@@ -110,7 +110,7 @@ void MoeDispatchKernel(const paddle::Tensor& input,
 if (group_moe) {
 paddle::Tensor softmax_max_prob_tensor =
 GetEmptyTensor({num_rows, moe_topk}, paddle::DataType::FLOAT32, place);
-// (TODO: check fill sucess ?)
+// (TODO: check fill success ?)
 paddle::experimental::fill(softmax_max_prob_tensor, 0.f);
 softmax_max_prob = softmax_max_prob_tensor.data<float>();
 }
@@ -507,7 +507,7 @@ elif paddle.is_compiled_with_cuda():
 sources += find_end_files(fp8_auto_gen_directory, ".cu")

 if cc >= 90 and nvcc_version >= 12.0:
-# Hopper optmized mla
+# Hopper optimized mla
 sources += find_end_files("gpu_ops/mla_attn", ".cu")
 sources += ["gpu_ops/flash_mask_attn/flash_mask_attn.cu"]
 sources += find_end_files("gpu_ops/moba_attn/moba_decoder_attn/", ".cu")
@@ -67,7 +67,7 @@ std::vector<paddle::Tensor> MoeLayerKernel(
 const auto xtype = x.dtype();
 auto x_dims = x.shape();
 auto up_gate_proj_dims = up_gate_proj_weight.shape();
-PD_CHECK(x_dims.size() == 2, "x_dims.size() shoud be 2.");
+PD_CHECK(x_dims.size() == 2, "x_dims.size() should be 2.");
 PD_CHECK(up_gate_proj_dims.size() == 3, "up_gate_proj_dims.size() should be 3.");
 PD_CHECK(down_proj_in_scale.get_ptr() == nullptr, "down_proj_in_scale not support.");
 if (quant_method == "weight_only_int4") {
@@ -122,7 +122,7 @@ void SpeculateStepSchedule(
 std::string inference_msg_id_env_str(inference_msg_id_env_p);
 inference_msg_id_from_env = std::stoi(inference_msg_id_env_str);
 if (inference_msg_id_from_env == 2) {
-// 2 and -2 is perserve for no-output indication.
+// 2 and -2 is preserve for no-output indication.
 throw std::runtime_error(
 " INFERENCE_MSG_ID cannot be 2, please use other number.");
 }
@@ -59,7 +59,7 @@ void SaveOutMmsg(const paddle::Tensor &x, const paddle::Tensor &not_need_stop,
 std::string inference_msg_id_env_str(inference_msg_id_env_p);
 inference_msg_id_from_env = std::stoi(inference_msg_id_env_str);
 if (inference_msg_id_from_env == 2) {
-// 2 and -2 is perserve for no-output indication.
+// 2 and -2 is preserve for no-output indication.
 throw std::runtime_error(
 " INFERENCE_MSG_ID cannot be 2, please use other number.");
 }
@@ -4,7 +4,7 @@
 namespace xpu3 {
 namespace plugin {
 #define MAX_LM_SIZE 28672
-// One core has 32KB LM(gropu LM), MAX_LM_SIZE = (32 - 4)KB / 2 = 30720, 4KB is
+// One core has 32KB LM(group LM), MAX_LM_SIZE = (32 - 4)KB / 2 = 30720, 4KB is
 // the stack space
 #define MAX_BATCH 512
 #define ALIGNMENT 64
@@ -4,7 +4,7 @@
 namespace xpu3 {
 namespace plugin {
 #define MAX_LM_SIZE 28672
-// One core has 32KB LM(gropu LM), MAX_LM_SIZE = (32 - 4)KB / 2 = 30720, 4KB is
+// One core has 32KB LM(group LM), MAX_LM_SIZE = (32 - 4)KB / 2 = 30720, 4KB is
 // the stack space
 #define MAX_BATCH 512
 #define ALIGNMENT 64
@@ -8,7 +8,7 @@
 namespace xpu3 {
 namespace plugin {
 #define MAX_SM_SIZE 32768
-// One core has 32KB LM(gropu LM), MAX_LM_SIZE = (32 - 4)KB / 2 = 30720, 4KB is
+// One core has 32KB LM(group LM), MAX_LM_SIZE = (32 - 4)KB / 2 = 30720, 4KB is
 // the stack space
 #define MAX_BATCH 512
 #define BANK_CONFLICT_M 128
@@ -79,7 +79,7 @@ qw_pd_trans = paddle.transpose(qw_pd, [1, 0])
 # print("wscale_pd:\n{}".format(wscale_pd))
 # print("wscale_np:\n{}".format(wscale_np))

-# comparation
+# comparison
 print(f"wscale_pd, mean={wscale_pd.mean()}, std={wscale_pd.std()}")
 print(f"wscale_np, mean={wscale_np.mean()}, std={wscale_np.std()}")
 print(f"qw_np, mean={qw_np.astype(np.float32).mean()}, std={qw_np.astype(np.float32).std()}")
@@ -44,7 +44,7 @@ CudaGrpah can be enabled by setting `--use-cudagraph` or `--graph-optimization-c
 The `graph_opt_level` parameter within `--graph-optimization-config` is used to configure the graph optimization level, with the following available options:
 + `0`: Use Dynamic compute graph, default to 0
 + `1`: Use Static compute graph, during the initialization phase, Paddle API will be used to convert the dynamic image into a static image
-+ `2`: Base on Static compute graph, use the complier(CINN, Compiler Infrastructure for Neural Networks) of Paddle to compile and optimize
++ `2`: Base on Static compute graph, use the compiler(CINN, Compiler Infrastructure for Neural Networks) of Paddle to compile and optimize

 In general, static graphs have lower Kernel Launch overhead than dynamic graphs, and it is recommended to use static graphs.
 For adapted models, FastDeploy's CudaGraph *can support both dynamic and static graphs* simultaneously.
@@ -62,7 +62,7 @@ python -m pip install paddlepaddle==3.1.1 -i https://www.paddlepaddle.org.cn/pac
 python -m pip install paddle-custom-gcu==3.1.1 -i https://www.paddlepaddle.org.cn/packages/stable/gcu/
 # For source compilation, refer to: https://github.com/PaddlePaddle/PaddleCustomDevice/blob/develop/backends/gcu/README_cn.md
 ```
-For latest paddle verion on iluvatar. Refer to [PaddlePaddle Installation](https://www.paddlepaddle.org.cn/)
+For latest paddle version on iluvatar. Refer to [PaddlePaddle Installation](https://www.paddlepaddle.org.cn/)

 6. Install FastDeploy and dependencies
 ```bash
@@ -89,4 +89,4 @@ for chunk in response:
 print('\n')
 ```

-For detailed OpenAI protocol specifications, see [OpenAI Chat Compeltion API](https://platform.openai.com/docs/api-reference/chat/create). Differences from the standard OpenAI protocol are documented in [OpenAI Protocol-Compatible API Server](../online_serving/README.md).
+For detailed OpenAI protocol specifications, see [OpenAI Chat Completion API](https://platform.openai.com/docs/api-reference/chat/create). Differences from the standard OpenAI protocol are documented in [OpenAI Protocol-Compatible API Server](../online_serving/README.md).
@@ -1,6 +1,6 @@
 # 采样策略

-采样策略用于决定如何从模型的输出概率分布中选择下一个token。FastDeploy目前支持 Top-p 、 Top-k_Top-p 和 Min-p Samping 多种采样策略。
+采样策略用于决定如何从模型的输出概率分布中选择下一个token。FastDeploy目前支持 Top-p 、 Top-k_Top-p 和 Min-p Sampling 多种采样策略。

 1. Top-p 采样

@@ -89,4 +89,4 @@ for chunk in response:
 print('\n')
 ```

-OpenAI 协议的更多说明可参考文档 [OpenAI Chat Compeltion API](https://platform.openai.com/docs/api-reference/chat/create),以及与 OpenAI 协议的区别可以参考 [兼容 OpenAI 协议的服务化部署](../online_serving/README.md)。
+OpenAI 协议的更多说明可参考文档 [OpenAI Chat Completion API](https://platform.openai.com/docs/api-reference/chat/create),以及与 OpenAI 协议的区别可以参考 [兼容 OpenAI 协议的服务化部署](../online_serving/README.md)。
@@ -57,7 +57,7 @@ def parse_args():
 "--protocol",
 type=str,
 default="ipc",
-help="cache transfer protocol, only surport ipc now",
+help="cache transfer protocol, only support ipc now",
 )
 parser.add_argument("--enable_splitwise", type=int, default=0, help="enable splitwise ")
 parser.add_argument("--cache_queue_port", type=int, default=9923, help="cache queue port")
@@ -257,7 +257,7 @@ class ParallelConfig:
 self.sequence_parallel = False # Whether to enable sequence parallelism.
 self.use_ep = False # Whether to enable Expert Parallelism
 self.moe_phase = MoEPhase("prefill") # Generation phase
-self.msg_queue_id = 1 # mesage queue id
+self.msg_queue_id = 1 # message queue id

 self.tensor_parallel_rank = 0 # TP rank ID
 self.tensor_parallel_size = 1 # TP degree
@@ -549,7 +549,7 @@ class GraphOptimizationConfig:
 It requires that all input buffers have fixed addresses, and all
 splitting ops write their outputs to input buffers.
 - With dyncmic graph backend: ...
-- With static grpah backend: WIP
+- With static graph backend: WIP
 """
 self.sot_warmup_sizes: list[int] = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 16, 32, 64, 128]
 """ Number of warmup runs for SOT warmup. """
@@ -531,7 +531,7 @@ class EngineArgs:
 "--quantization",
 type=str,
 default=EngineArgs.quantization,
-help="Quantization name for the model, currentlly support "
+help="Quantization name for the model, currently support "
 "'wint8', 'wint4',"
 "default is None. The priority of this configuration "
 "is lower than that of the config file. "
@@ -829,7 +829,7 @@ class EngineArgs:
 scheduler_group.add_argument(
 "--scheduler-topic",
 default=EngineArgs.scheduler_topic,
-help=f"Topic of scheduler. Defaule is {EngineArgs.scheduler_topic}. (global)",
+help=f"Topic of scheduler. Default is {EngineArgs.scheduler_topic}. (global)",
 )
 scheduler_group.add_argument(
 "--scheduler-min-load-score",
@@ -644,13 +644,13 @@ class EngineSevice:
 self.zmq_server.send_multipart(request_id, [error_result])
 except Exception as e:
 llm_logger.error(
-f"Error happend while receving new request from zmq, details={e}, "
+f"Error happend while receiving new request from zmq, details={e}, "
 f"traceback={traceback.format_exc()}"
 )

 def _zmq_send_generated_tokens(self):
 """
-Recieve output for zmq
+Receive output for zmq
 """
 while self.running:
 try:
@@ -458,7 +458,7 @@ class ResourceManagerV1(ResourceManager):

 def _free_blocks(self, request: Request):
 if self.config.cache_config.enable_prefix_caching:
-# TODO(chengyanfu): support cache ouput blocks for prefix caching
+# TODO(chengyanfu): support cache output blocks for prefix caching
 if request.get("prefill_block_num", None) is None:
 leaf_node = self.cache_manager.req_leaf_map[request.request_id]
 self.cache_manager.decrease_request_share_count(request.request_id)
@@ -112,7 +112,7 @@ class LLM:

 def _receive_output(self):
 """
-Recieve output from token processor and store them in cache
+Receive output from token processor and store them in cache
 """
 while True:
 try:
@@ -40,7 +40,7 @@ class ConcreteSizeEntry:
 # Has runtime-bs been captured before
 captured: bool = False

-# Need to be captured callable object(dynamic graph or static grpah backend)
+# Need to be captured callable object(dynamic graph or static graph backend)
 runnable: Callable = None # type: ignore
 # Number of completed warmups
 num_finished_warmup: int = 0
@@ -117,9 +117,9 @@ class GraphOptBackend:

 self.max_captre_batch = fd_config.graph_opt_config.cudagraph_capture_sizes[0]
 if self.fd_config.graph_opt_config.graph_opt_level > 0:
-# 1. Prepare cuda grpah input buffers (contain output of subgraphs)
+# 1. Prepare cuda graph input buffers (contain output of subgraphs)

-# 2. Convert dynamic grpah to static graph
+# 2. Convert dynamic graph to static graph

 backend = (
 ToStaticBackend.CINN if self.fd_config.graph_opt_config.graph_opt_level > 1 else ToStaticBackend.PHI
@@ -193,7 +193,7 @@ class AppendAttentionBackend(AttentionBackend):
 kv_cache_quant_type: str = None,
 ):
 """
-Caculate kv cache shape
+Calculate kv cache shape
 """
 if kv_cache_quant_type is not None and kv_cache_quant_type == "int4_zp":
 return (
@@ -114,7 +114,7 @@ class BlockAttentionBackend(AttentionBackend):
 kv_cache_quant_type: str = None,
 ):
 """
-Caculate kv cache shape
+Calculate kv cache shape
 """
 if kv_cache_quant_type is not None and kv_cache_quant_type == "int4_zp":
 return (
@@ -176,7 +176,7 @@ class FlashAttentionBackend(AttentionBackend):
 kv_cache_quant_type: str = None,
 ):
 """
-Caculate kv cache shape
+Calculate kv cache shape
 """
 if kv_cache_quant_type is not None and kv_cache_quant_type == "int4_zp":
 return (
@@ -210,7 +210,7 @@ class IluvatarAttnBackend(AttentionBackend):
 kv_cache_quant_type: str = None,
 ):
 """
-Caculate kv cache shape
+Calculate kv cache shape
 """
 return (
 max_num_blocks,
@@ -130,7 +130,7 @@ class XPUAttentionBackend(AttentionBackend):
 kv_cache_quant_type: str = None,
 ) -> Tuple[int, int, int, int]:
 """
-Caculate kv cache shape
+Calculate kv cache shape
 """
 return (
 max_num_blocks,
@@ -170,7 +170,7 @@ class GCUFlashAttnBackend(AttentionBackend):
 cache_len = 0
 elif self.seq_lens_decoder_list[seq_idx][0] != 0: # decode
 cache_len = self.seq_lens_decoder_list[seq_idx][0]
-# else: doesnot have req in this seq_idx
+# else: doesn't have req in this seq_idx

 if cache_len is not None:
 lens_this_time = self.seq_lens_this_time_list[seq_idx]
@@ -212,7 +212,7 @@ class GCUFlashAttnBackend(AttentionBackend):
 kv_cache_quant_type: str = None,
 ):
 """
-Caculate kv cache shape
+Calculate kv cache shape
 """
 # [total_tokens, kv_num_heads, head_dim]
 return (
@@ -171,7 +171,7 @@ class GCUMemEfficientAttnBackend(AttentionBackend):
 cache_len = 0
 elif self.seq_lens_decoder_list[seq_idx][0] != 0: # decode
 cache_len = self.seq_lens_decoder_list[seq_idx][0]
-# else: doesnot have req in this seq_idx
+# else: doesn't have req in this seq_idx

 if cache_len is not None:
 lens_this_time = self.seq_lens_this_time_list[seq_idx]
@@ -224,7 +224,7 @@ class GCUMemEfficientAttnBackend(AttentionBackend):
 kv_cache_quant_type: str = None,
 ):
 """
-Caculate kv cache shape
+Calculate kv cache shape
 """
 # [total_tokens, kv_num_heads, head_dim]
 return (
@@ -137,7 +137,7 @@ class FlashAttentionBackend(AttentionBackend):
 kv_cache_quant_type: str = None,
 ):
 """
-Caculate kv cache shape
+Calculate kv cache shape
 """
 if kv_cache_quant_type is not None and kv_cache_quant_type == "int4_zp":
 return (
@@ -114,7 +114,7 @@ class DeepEPEngine:
 low_latency_mode=True,
 num_qps_per_rank=24,
 )
-# In disaggregated mode on mutiple nodes, we either use
+# In disaggregated mode on multiple nodes, we either use
 # high throughput mode or low latency mode.
 else:
 if moe_phase.phase == "decode":
@@ -35,7 +35,7 @@ class EarlyStopper:
 @abstractmethod
 def process(self, probs: paddle.Tensor, next_tokens: paddle.Tensor, stop_flags: paddle.Tensor):
 """
-processs the stopper and set the stop_flags corresponding to the batch that triggers early stop to True
+process the stopper and set the stop_flags corresponding to the batch that triggers early stop to True
 args:
 - probs: [batch_size, vocab_size], the probs of every sample
 - next_tokens: [batch_size, 1], the token index of every chosen sample
@@ -267,7 +267,7 @@ class TokenProcessor:
 spec_logger.info(
 f"Speculate global accept ratio(Accept draft_tokens/Generated tokens): {accept_ratio}"
 f" total step: {self.total_step}. total output token num: {self.number_of_output_tokens}"
-f" avarage accept len: {self.number_of_output_tokens / self.total_step}"
+f" average accept len: {self.number_of_output_tokens / self.total_step}"
 )

 if self.cfg.speculative_config.method in ["mtp"]:
@@ -72,7 +72,7 @@ class Proposer(ABC):
 @abstractmethod
 def _run_impl(self, *args, **kwargs) -> Any:
 """
-Implemention for different method
+Implementation for different method
 """
 raise NotImplementedError

@@ -14,7 +14,7 @@
 # limitations under the License.
 """

-"""redundant expert manger."""
+"""redundant expert manager."""
 from typing import Optional, Tuple

 import numpy as np
@@ -49,7 +49,7 @@ class GcuWorker(WorkerBase):
 def init_device(self):
 """Initialize device and Construct model runner"""
 if paddle.is_compiled_with_custom_device("gcu"):
-# Set evironment variable
+# Set environment variable
 self.device_ids = self.parallel_config.device_ids.split(",")
 self.device = f"gcu:{self.local_rank}"
 paddle.device.set_device(self.device)
@@ -127,7 +127,7 @@ class GcuWorker(WorkerBase):
 # NOTE(gongshaotian): may be not need warm_up at this place
 if self.model_runner.graph_opt_level >= 1:
 self.model_runner.sot_warmup()
-# 2. Triger cuda grpah capture
+# 2. Trigger cuda graph capture
 self.model_runner.capture_model()
 set_random_seed(self.fd_config.model_config.seed)

@@ -60,7 +60,7 @@ class GpuWorker(WorkerBase):
 """
 self.max_chips_per_node = 16 if current_platform.is_iluvatar() else 8
 if self.device_config.device_type == "cuda" and paddle.device.is_compiled_with_cuda():
-# Set evironment variable
+# Set environment variable
 self.device_ids = self.parallel_config.device_ids.split(",")
 self.device = f"gpu:{self.local_rank % self.max_chips_per_node}"
 paddle.device.set_device(self.device)
@@ -169,7 +169,7 @@ class GpuWorker(WorkerBase):
 )
 )

-return available_kv_cache_memory # return to caculate the block num in this device
+return available_kv_cache_memory # return to calculate the block num in this device

 def load_model(self) -> None:
 """Load model"""
@@ -209,7 +209,7 @@ class GpuWorker(WorkerBase):
 """
 if self.model_runner.graph_opt_level >= 1:
 self.model_runner.sot_warmup()
-# Triger cuda grpah capture
+# Trigger cuda graph capture
 self.model_runner.capture_model()

 def check_health(self) -> bool:
@@ -51,7 +51,7 @@ class IluvatarWorker(GpuWorker):
 Initialize device and construct model runner
 """
 if paddle.is_compiled_with_custom_device("iluvatar_gpu"):
-# Set evironment variable
+# Set environment variable
 self.device = f"iluvatar_gpu:{self.local_rank}"
 paddle.device.set_device(self.device)
 paddle.set_default_dtype(self.parallel_config.dtype)
@@ -54,7 +54,7 @@ class MetaxWorker(WorkerBase):
 """
 self.max_chips_per_node = 8
 if paddle.is_compiled_with_custom_device("metax_gpu"):
-# Set evironment variable
+# Set environment variable
 self.device_ids = self.parallel_config.device_ids.split(",")
 self.device = f"metax_gpu:{self.local_rank % self.max_chips_per_node}"
 paddle.device.set_device(self.device)
@@ -202,7 +202,7 @@ class MetaxWorker(WorkerBase):
 """
 if self.model_runner.graph_opt_level >= 1:
 self.model_runner.sot_warmup()
-# Todo Triger cuda grpah capture.
+# Todo Trigger cuda graph capture.

 def check_health(self) -> bool:
 """ """
@@ -21,7 +21,7 @@ import traceback
 def check_safetensors_model(model_dir: str):
 """
 model_dir : the directory of the model
-Check whther the model is safetensors format
+Check whether the model is safetensors format
 """
 model_files = list()
 all_files = os.listdir(model_dir)
@@ -27,7 +27,7 @@ from fastdeploy.worker.output import ModelRunnerOutput
 class WorkerBase(ABC):
 """
 Engine -> (WIP)Executor -> Worker -> ModelRunner -> Model
-Worker interface that allows inference framwork to cleanly separate implementations for different harware.
+Worker interface that allows inference framework to cleanly separate implementations for different hardware.
 """

 def __init__(
@@ -89,7 +89,7 @@ class WorkerBase(ABC):

 @abstractmethod
 def graph_optimize_and_warm_up_model(self) -> None:
-"""Prepare model for execution through grpah optimizaiton(CudaGrpah/CINN) or warmup."""
+"""Prepare model for execution through graph optimizaiton(CudaGrpah/CINN) or warmup."""
 raise NotImplementedError

 @abstractmethod
@@ -249,7 +249,7 @@ class PaddleDisWorkerProc:
 )

 def event_loop_normal(self) -> None:
-"""Main event loop for Paddle Distrubuted Workers.
+"""Main event loop for Paddle Distributed Workers.
 TODO(gongshaotian): support remote calling of functions that control worker.
 """
 # Currently, only support single node
@@ -493,7 +493,7 @@ def parse_args():
 "--speculative_config",
 type=json.loads,
 default=None,
-help="Configation of SpeculativeConfig.",
+help="Configuration of SpeculativeConfig.",
 )
 parser.add_argument(
 "--max_num_batched_tokens",
@@ -542,7 +542,7 @@ def parse_args():
 "--quantization",
 type=str,
 default="None",
-help="Quantization name for the model, currentlly support "
+help="Quantization name for the model, currently support "
 "'wint4', 'wint8',"
 "default is None. The priority of this configuration "
 "is lower than that of the config file. "
@@ -552,7 +552,7 @@ def parse_args():
 "--graph_optimization_config",
 type=json.loads,
 default=None,
-help="Configation of Graph optimization backend.",
+help="Configuration of Graph optimization backend.",
 )
 parser.add_argument(
 "--moba_attention_config",
@@ -50,7 +50,7 @@ class XpuWorker(WorkerBase):
 def init_device(self):
 """Initialize device and Construct model runner"""
 if paddle.is_compiled_with_xpu():
-# Set evironment variable
+# Set environment variable
 self.device = f"xpu:{self.local_rank}"
 paddle.device.set_device(self.device)
 paddle.set_default_dtype(self.parallel_config.dtype)
@@ -107,7 +107,7 @@ class TestModel1(paddle.nn.Layer):
 sub_meta1 = forward_meta
 sublayer1_output = self.sublayer1(ids_remove_padding=ids_remove_padding, forward_meta=sub_meta1)

-# sublayer2 not use cuda garph
+# sublayer2 not use cuda graph
 sub_meta2 = ForwardMeta(input_ids=sublayer1_output, ids_remove_padding=sublayer1_output)
 sublayer2_output = self.sublayer2(ids_remove_padding=sublayer1_output, forward_meta=sub_meta2)
 self.sublayer2_output_buffer.copy_(sublayer2_output, False)
@@ -131,7 +131,7 @@ class TestModel1(paddle.nn.Layer):
 ids_remove_padding=ids_remove_padding, forward_meta=sub_meta1
 )

-# sublayer2 not use cuda garph
+# sublayer2 not use cuda graph
 sub_meta2 = ForwardMeta(input_ids=sublayer1_output, ids_remove_padding=sublayer1_output)
 sublayer2_output = self.sublayer2.forward_correct(ids_remove_padding=sublayer1_output, forward_meta=sub_meta2)
