diff --git a/custom_ops/gpu_ops/cpp_extensions.cc b/custom_ops/gpu_ops/cpp_extensions.cc index 027a33dc0..f20243bd8 100644 --- a/custom_ops/gpu_ops/cpp_extensions.cc +++ b/custom_ops/gpu_ops/cpp_extensions.cc @@ -980,7 +980,7 @@ PYBIND11_MODULE(fastdeploy_ops, m) { m.def("per_token_quant_padding", &PerTokenQuantPadding, py::arg("input"), py::arg("block_size"), - "per token per block quant and padding tranpose scale"); + "per token per block quant and padding transpose scale"); m.def("masked_per_token_quant", &MaskedPerTokenQuant, py::arg("input"), py::arg("recv_expert_count"), py::arg("block_size"), diff --git a/custom_ops/gpu_ops/cutlass_extensions/gemm/threadblock/wint2x_mma_base.h b/custom_ops/gpu_ops/cutlass_extensions/gemm/threadblock/wint2x_mma_base.h index 4b7d3ac06..bc91e724b 100644 --- a/custom_ops/gpu_ops/cutlass_extensions/gemm/threadblock/wint2x_mma_base.h +++ b/custom_ops/gpu_ops/cutlass_extensions/gemm/threadblock/wint2x_mma_base.h @@ -89,11 +89,11 @@ public: GemmShape; - /// Number of warp-level GEMM oeprations + /// Number of warp-level GEMM operations static int const kWarpGemmIterations = (WarpGemm::kK / Operator::Policy::MmaShape::kK); - /// Number of warp-level GEMM oeprations per load for B + /// Number of warp-level GEMM operations per load for B static constexpr int kWarpGemmIterationsPerLoadForB = Operator::IteratorB::InstructionShape::kRow / Operator::InstructionShape::kK; static_assert(!(kWarpGemmIterations % kWarpGemmIterationsPerLoadForB), ""); diff --git a/custom_ops/gpu_ops/cutlass_kernels/fp8_gemm_fused/dual_gemm/thread/left_gelu_and_mul.h b/custom_ops/gpu_ops/cutlass_kernels/fp8_gemm_fused/dual_gemm/thread/left_gelu_and_mul.h index 743b6c70a..9dfddf83f 100644 --- a/custom_ops/gpu_ops/cutlass_kernels/fp8_gemm_fused/dual_gemm/thread/left_gelu_and_mul.h +++ b/custom_ops/gpu_ops/cutlass_kernels/fp8_gemm_fused/dual_gemm/thread/left_gelu_and_mul.h @@ -117,7 +117,7 @@ class LeftGELUAndMul { CUTLASS_HOST_DEVICE FragmentOutput operator()(FragmentAccumulator const &lhs, FragmentAccumulator const &rhs) const { - // Convert source to interal compute numeric type + // Convert source to internal compute numeric type NumericArrayConverter accumulator_to_compute; diff --git a/custom_ops/gpu_ops/cutlass_kernels/fp8_gemm_fused/dual_gemm/thread/left_silu_and_mul.h b/custom_ops/gpu_ops/cutlass_kernels/fp8_gemm_fused/dual_gemm/thread/left_silu_and_mul.h index 7c1213c7e..7a433bccd 100644 --- a/custom_ops/gpu_ops/cutlass_kernels/fp8_gemm_fused/dual_gemm/thread/left_silu_and_mul.h +++ b/custom_ops/gpu_ops/cutlass_kernels/fp8_gemm_fused/dual_gemm/thread/left_silu_and_mul.h @@ -117,7 +117,7 @@ class LeftSiLUAndMul { CUTLASS_HOST_DEVICE FragmentOutput operator()(FragmentAccumulator const &lhs, FragmentAccumulator const &rhs) const { - // Convert source to interal compute numeric type + // Convert source to internal compute numeric type NumericArrayConverter accumulator_to_compute; diff --git a/custom_ops/gpu_ops/cutlass_kernels/fp8_gemm_fused/dual_gemm/threadblock/dual_mma_base.h b/custom_ops/gpu_ops/cutlass_kernels/fp8_gemm_fused/dual_gemm/threadblock/dual_mma_base.h index 2d6fbc1fd..530bf5665 100644 --- a/custom_ops/gpu_ops/cutlass_kernels/fp8_gemm_fused/dual_gemm/threadblock/dual_mma_base.h +++ b/custom_ops/gpu_ops/cutlass_kernels/fp8_gemm_fused/dual_gemm/threadblock/dual_mma_base.h @@ -92,7 +92,7 @@ class DualMmaBase { Shape::kN / WarpGemm::kN, Shape::kK / WarpGemm::kK>; - /// Number of warp-level GEMM oeprations + /// Number of warp-level GEMM operations static int const 
kWarpGemmIterations = (WarpGemm::kK / Operator0::Policy::MmaShape::kK); diff --git a/custom_ops/gpu_ops/cutlass_kernels/w4a8_moe/cutlass_extensions/epilogue/threadblock/epilogue_per_row_per_col_scale_nf4.h b/custom_ops/gpu_ops/cutlass_kernels/w4a8_moe/cutlass_extensions/epilogue/threadblock/epilogue_per_row_per_col_scale_nf4.h index 1d7abaabd..457416d75 100644 --- a/custom_ops/gpu_ops/cutlass_kernels/w4a8_moe/cutlass_extensions/epilogue/threadblock/epilogue_per_row_per_col_scale_nf4.h +++ b/custom_ops/gpu_ops/cutlass_kernels/w4a8_moe/cutlass_extensions/epilogue/threadblock/epilogue_per_row_per_col_scale_nf4.h @@ -219,7 +219,7 @@ class EpilogueVisitorPerRowPerColNf4 { iterator_C_.clear_mask(); } // NOTE(wangbojun) Currently, this kernel don't hanve implantention for - // adding elementwise beta, we keep this here for future useage beta_ = + // adding elementwise beta, we keep this here for future usage beta_ = // (params.elementwise.beta_ptr ? *params.elementwise.beta_ptr : // params.elementwise.beta); if (beta_ == ElementAccumulator()) { // iterator_C_.clear_mask(); diff --git a/custom_ops/gpu_ops/cutlass_kernels/w4a8_moe/cutlass_extensions/epilogue/threadblock/epilogue_tensor_op_int32.h b/custom_ops/gpu_ops/cutlass_kernels/w4a8_moe/cutlass_extensions/epilogue/threadblock/epilogue_tensor_op_int32.h index 40da912df..0a3c107f1 100644 --- a/custom_ops/gpu_ops/cutlass_kernels/w4a8_moe/cutlass_extensions/epilogue/threadblock/epilogue_tensor_op_int32.h +++ b/custom_ops/gpu_ops/cutlass_kernels/w4a8_moe/cutlass_extensions/epilogue/threadblock/epilogue_tensor_op_int32.h @@ -176,7 +176,7 @@ struct Nf4DefaultIteratorsTensorOp class SharedLoadIteratorMixed { public: diff --git a/custom_ops/gpu_ops/cutlass_kernels/w4a8_moe/cutlass_extensions/gemm/threadblock/default_mma_nf4_int8_interleaved.h b/custom_ops/gpu_ops/cutlass_kernels/w4a8_moe/cutlass_extensions/gemm/threadblock/default_mma_nf4_int8_interleaved.h index 107251cd8..540c9a093 100644 --- a/custom_ops/gpu_ops/cutlass_kernels/w4a8_moe/cutlass_extensions/gemm/threadblock/default_mma_nf4_int8_interleaved.h +++ b/custom_ops/gpu_ops/cutlass_kernels/w4a8_moe/cutlass_extensions/gemm/threadblock/default_mma_nf4_int8_interleaved.h @@ -64,7 +64,7 @@ template < typename InstructionShape_, /// Number of stages used in the pipelined mainloop int Stages, - /// Operation perfomed by GEMM + /// Operation performed by GEMM typename Operator, /// Store the accumulators in row major or column major. Row major is used /// when output layout is interleaved. 
diff --git a/custom_ops/gpu_ops/cutlass_kernels/w4a8_moe/cutlass_extensions/gemm/threadblock/int8_mma_base.h b/custom_ops/gpu_ops/cutlass_kernels/w4a8_moe/cutlass_extensions/gemm/threadblock/int8_mma_base.h index 9648deb56..ac4719044 100644 --- a/custom_ops/gpu_ops/cutlass_kernels/w4a8_moe/cutlass_extensions/gemm/threadblock/int8_mma_base.h +++ b/custom_ops/gpu_ops/cutlass_kernels/w4a8_moe/cutlass_extensions/gemm/threadblock/int8_mma_base.h @@ -133,7 +133,7 @@ public: /// Shape describing the number of warps filling the CTA using WarpCount = GemmShape; - /// Number of warp-level GEMM oeprations + /// Number of warp-level GEMM operations static int const kWarpGemmIterations = (WarpGemm::kK / Operator::Policy::MmaShape::kK); static_assert(Operator::IteratorB::InstructionShape::kRow>=Operator::InstructionShape::kK,""); static constexpr int kNumKIterationsPerWarpBLoad = diff --git a/custom_ops/gpu_ops/cutlass_kernels/w4a8_moe/cutlass_extensions/gemm/threadblock/int8_mma_multistage.h b/custom_ops/gpu_ops/cutlass_kernels/w4a8_moe/cutlass_extensions/gemm/threadblock/int8_mma_multistage.h index 218b33c5a..3021605f2 100644 --- a/custom_ops/gpu_ops/cutlass_kernels/w4a8_moe/cutlass_extensions/gemm/threadblock/int8_mma_multistage.h +++ b/custom_ops/gpu_ops/cutlass_kernels/w4a8_moe/cutlass_extensions/gemm/threadblock/int8_mma_multistage.h @@ -509,7 +509,7 @@ public: this->warp_tile_iterator_B_.load(warp_frag_B[(warp_tileB_k_load_offset + 1) % 2]); ++this->warp_tile_iterator_B_; } - // TOOD(wangbojun) lds_converter can be remove for int8 B input + // TODO(wangbojun) lds_converter can be remove for int8 B input typename TransformBAfterLDS::result_type converted_frag_B = lds_converter(warp_frag_B[warp_tileB_k_load_offset % 2]); diff --git a/custom_ops/gpu_ops/cutlass_kernels/w4a8_moe/cutlass_extensions/gemm/threadblock/nf4_int8_mma_base.h b/custom_ops/gpu_ops/cutlass_kernels/w4a8_moe/cutlass_extensions/gemm/threadblock/nf4_int8_mma_base.h index 02c03c707..413e53a9a 100644 --- a/custom_ops/gpu_ops/cutlass_kernels/w4a8_moe/cutlass_extensions/gemm/threadblock/nf4_int8_mma_base.h +++ b/custom_ops/gpu_ops/cutlass_kernels/w4a8_moe/cutlass_extensions/gemm/threadblock/nf4_int8_mma_base.h @@ -96,7 +96,7 @@ public: /// Shape describing the number of warps filling the CTA using WarpCount = GemmShape; - /// Number of warp-level GEMM oeprations + /// Number of warp-level GEMM operations static int const kWarpGemmIterations = (WarpGemm::kK / Operator::Policy::MmaShape::kK); static_assert(Operator::IteratorB::InstructionShape::kRow>=Operator::InstructionShape::kK,""); static constexpr int kNumKIterationsPerWarpBLoad = diff --git a/custom_ops/gpu_ops/cutlass_kernels/w4a8_moe/cutlass_extensions/gemm/threadblock/nf4_int8_mma_multistage.h b/custom_ops/gpu_ops/cutlass_kernels/w4a8_moe/cutlass_extensions/gemm/threadblock/nf4_int8_mma_multistage.h index e7e204620..d124b09cb 100644 --- a/custom_ops/gpu_ops/cutlass_kernels/w4a8_moe/cutlass_extensions/gemm/threadblock/nf4_int8_mma_multistage.h +++ b/custom_ops/gpu_ops/cutlass_kernels/w4a8_moe/cutlass_extensions/gemm/threadblock/nf4_int8_mma_multistage.h @@ -646,7 +646,7 @@ public: // ); // } } - // TOOD(wangbojun) lds_converter can be remove for int8 B input + // TODO(wangbojun) lds_converter can be remove for int8 B input // int4 // typename TransformBAfterLDS::result_type converted_frag_B = // lds_converter(warp_frag_B[warp_tileB_k_load_offset % 2]); diff --git a/custom_ops/gpu_ops/int8_gemm_with_cutlass/epilogue_tensor_op_int32.h 
b/custom_ops/gpu_ops/int8_gemm_with_cutlass/epilogue_tensor_op_int32.h index a076e0c51..776ccce14 100644 --- a/custom_ops/gpu_ops/int8_gemm_with_cutlass/epilogue_tensor_op_int32.h +++ b/custom_ops/gpu_ops/int8_gemm_with_cutlass/epilogue_tensor_op_int32.h @@ -171,7 +171,7 @@ struct DefaultIteratorsTensorOp class SharedLoadIteratorMixed { public: diff --git a/custom_ops/gpu_ops/moe/moe_dispatch.cu b/custom_ops/gpu_ops/moe/moe_dispatch.cu index 85bad95cd..d42b9f36b 100644 --- a/custom_ops/gpu_ops/moe/moe_dispatch.cu +++ b/custom_ops/gpu_ops/moe/moe_dispatch.cu @@ -80,7 +80,7 @@ void MoeDispatchKernel( if (group_moe) { paddle::Tensor softmax_max_prob_tensor = GetEmptyTensor({num_rows, moe_topk}, paddle::DataType::FLOAT32, place); - // (TODO: check fill sucess ?) + // (TODO: check fill success ?) paddle::experimental::fill(softmax_max_prob_tensor, 0.f); softmax_max_prob = softmax_max_prob_tensor.data(); } diff --git a/custom_ops/gpu_ops/save_output_msg_with_topk.cc b/custom_ops/gpu_ops/save_output_msg_with_topk.cc index a9bf763b9..be8cab4c0 100644 --- a/custom_ops/gpu_ops/save_output_msg_with_topk.cc +++ b/custom_ops/gpu_ops/save_output_msg_with_topk.cc @@ -75,7 +75,7 @@ void SaveOutMmsgTopK(const paddle::Tensor& x, std::string inference_msg_id_env_str(inference_msg_id_env_p); inference_msg_id_from_env = std::stoi(inference_msg_id_env_str); if (inference_msg_id_from_env == 2) { - // 2 and -2 is perserve for no-output indication. + // 2 and -2 are reserved for no-output indication. throw std::runtime_error( " INFERENCE_MSG_ID cannot be 2, please use other number."); } diff --git a/custom_ops/gpu_ops/save_with_output_msg.cc b/custom_ops/gpu_ops/save_with_output_msg.cc index 45d0ac7fb..261300f21 100644 --- a/custom_ops/gpu_ops/save_with_output_msg.cc +++ b/custom_ops/gpu_ops/save_with_output_msg.cc @@ -45,7 +45,7 @@ void save_kernel(const paddle::Tensor& x, std::string inference_msg_id_env_str(inference_msg_id_env_p); inference_msg_id_from_env = std::stoi(inference_msg_id_env_str); if (inference_msg_id_from_env == 2) { - // 2 and -2 is perserve for no-output indication. + // 2 and -2 are reserved for no-output indication.
throw std::runtime_error( " INFERENCE_MSG_ID cannot be 2, please use other number."); } diff --git a/custom_ops/gpu_ops/set_value_by_flags.cu b/custom_ops/gpu_ops/set_value_by_flags.cu index 38d2ea045..9e7a0ce11 100644 --- a/custom_ops/gpu_ops/set_value_by_flags.cu +++ b/custom_ops/gpu_ops/set_value_by_flags.cu @@ -34,7 +34,7 @@ __global__ void set_value_by_flag_and_id(const bool *stop_flags, const int64_t *input_ids_now = input_ids + tid * length_input_ids; const int seq_len_dec = seq_lens_decoder[tid]; const int seq_len_enc = seq_lens_encoder[tid]; - if (seq_len_dec == 0 && seq_len_enc == 0) return; // stoped + if (seq_len_dec == 0 && seq_len_enc == 0) return; // stopped if (step_idx[tid] >= 0) { if (seq_len_enc > 0) { // encoder, get last token accord to seq_lens_encoder pre_ids_all_now[step_idx[tid]] = input_ids_now[seq_len_enc - 1]; diff --git a/custom_ops/gpu_ops/speculate_decoding/draft_model/eagle_get_base_model_hidden_states.cu b/custom_ops/gpu_ops/speculate_decoding/draft_model/eagle_get_base_model_hidden_states.cu index 8ae4bac75..43501cdbd 100644 --- a/custom_ops/gpu_ops/speculate_decoding/draft_model/eagle_get_base_model_hidden_states.cu +++ b/custom_ops/gpu_ops/speculate_decoding/draft_model/eagle_get_base_model_hidden_states.cu @@ -63,7 +63,7 @@ __global__ void ComputeOrderKernel( position_map[in_offset++] = out_offset++; } in_offset += cur_base_model_seq_lens_this_time - accept_num; -// (liuzichang): Temperary Reserved for debug +// (liuzichang): Temporarily reserved for debugging // if (accept_num <= actual_draft_token_num) /*Accept partial draft tokens*/ { // #ifdef DEBUG_EAGLE_KERNEL // printf("batch %d: accept_num <= actual_draft_token_num \n", i); diff --git a/custom_ops/gpu_ops/speculate_decoding/speculate_set_value_by_flags.cu b/custom_ops/gpu_ops/speculate_decoding/speculate_set_value_by_flags.cu index cacbd1387..4b1c7747e 100644 --- a/custom_ops/gpu_ops/speculate_decoding/speculate_set_value_by_flags.cu +++ b/custom_ops/gpu_ops/speculate_decoding/speculate_set_value_by_flags.cu @@ -35,7 +35,7 @@ __global__ void speculate_set_value_by_flag_and_id(int64_t *pre_ids_all, accept_tokens + tid * max_draft_tokens; const int seq_len_dec = seq_lens_decoder[tid]; const int seq_len_enc = seq_lens_encoder[tid]; - if (seq_len_dec == 0 && seq_len_enc == 0) return; // stoped + if (seq_len_dec == 0 && seq_len_enc == 0) return; // stopped // printf("step_idx[tid] %d\n", step_idx[tid]); if (step_idx[tid] >= 0) { for (int i = 0; i < accept_num[tid]; i++) { diff --git a/custom_ops/gpu_ops/speculate_decoding/speculate_step_reschedule.cu b/custom_ops/gpu_ops/speculate_decoding/speculate_step_reschedule.cu index baf1da9e1..853f894e0 100644 --- a/custom_ops/gpu_ops/speculate_decoding/speculate_step_reschedule.cu +++ b/custom_ops/gpu_ops/speculate_decoding/speculate_step_reschedule.cu @@ -295,7 +295,7 @@ void SpeculateStepSchedule(const paddle::Tensor &stop_flags, std::string inference_msg_id_env_str(inference_msg_id_env_p); inference_msg_id_from_env = std::stoi(inference_msg_id_env_str); if (inference_msg_id_from_env == 2) { - // 2 and -2 is perserve for no-output indication. + // 2 and -2 are reserved for no-output indication.
throw std::runtime_error( " INFERENCE_MSG_ID cannot be 2, please use other number."); } diff --git a/custom_ops/gpu_ops/step_reschedule.cu b/custom_ops/gpu_ops/step_reschedule.cu index bb7062d19..8475b317e 100644 --- a/custom_ops/gpu_ops/step_reschedule.cu +++ b/custom_ops/gpu_ops/step_reschedule.cu @@ -283,7 +283,7 @@ void Schedule(const paddle::Tensor &stop_flags, std::string inference_msg_id_env_str(inference_msg_id_env_p); inference_msg_id_from_env = std::stoi(inference_msg_id_env_str); if (inference_msg_id_from_env == 2) { - // 2 and -2 is perserve for no-output indication. + // 2 and -2 are reserved for no-output indication. throw std::runtime_error( " INFERENCE_MSG_ID cannot be 2, please use other number."); } diff --git a/custom_ops/gpu_ops/token_transfer.hpp b/custom_ops/gpu_ops/token_transfer.hpp index a7a5dd164..edfa07e99 100644 --- a/custom_ops/gpu_ops/token_transfer.hpp +++ b/custom_ops/gpu_ops/token_transfer.hpp @@ -58,7 +58,7 @@ class TokenTransfer { } // once copy: cpu --> cpu - // arrary length should be (1 + MAX_BATCH) + // array length should be (1 + MAX_BATCH) bool GetBatchToken(int64_t *array) { if (Empty()) { return false; diff --git a/custom_ops/gpu_ops/update_split_fuse_input.cu b/custom_ops/gpu_ops/update_split_fuse_input.cu index 0d9c80488..17bacb013 100644 --- a/custom_ops/gpu_ops/update_split_fuse_input.cu +++ b/custom_ops/gpu_ops/update_split_fuse_input.cu @@ -75,10 +75,10 @@ void UpdateSplitFuseInputes(const paddle::Tensor& split_fuse_seq_lens, const int max_seq_len, const int max_batch_size, const int split_fuse_size) { - dim3 girds; - girds.x = max_batch_size; + dim3 grids; + grids.x = max_batch_size; const int block_size = 128; - update_split_fuse_inputs_kernel<<>>( diff --git a/custom_ops/iluvatar_ops/moe_dispatch.cu b/custom_ops/iluvatar_ops/moe_dispatch.cu index a6195f44e..ee0a80871 100644 --- a/custom_ops/iluvatar_ops/moe_dispatch.cu +++ b/custom_ops/iluvatar_ops/moe_dispatch.cu @@ -110,7 +110,7 @@ void MoeDispatchKernel(const paddle::Tensor& input, if (group_moe) { paddle::Tensor softmax_max_prob_tensor = GetEmptyTensor({num_rows, moe_topk}, paddle::DataType::FLOAT32, place); - // (TODO: check fill sucess ?) + // (TODO: check fill success ?)
paddle::experimental::fill(softmax_max_prob_tensor, 0.f); softmax_max_prob = softmax_max_prob_tensor.data(); } diff --git a/custom_ops/setup_ops.py b/custom_ops/setup_ops.py index a0757d180..6c24a25f9 100644 --- a/custom_ops/setup_ops.py +++ b/custom_ops/setup_ops.py @@ -507,7 +507,7 @@ elif paddle.is_compiled_with_cuda(): sources += find_end_files(fp8_auto_gen_directory, ".cu") if cc >= 90 and nvcc_version >= 12.0: - # Hopper optmized mla + # Hopper optimized mla sources += find_end_files("gpu_ops/mla_attn", ".cu") sources += ["gpu_ops/flash_mask_attn/flash_mask_attn.cu"] sources += find_end_files("gpu_ops/moba_attn/moba_decoder_attn/", ".cu") diff --git a/custom_ops/xpu_ops/src/ops/moe_layer.cc b/custom_ops/xpu_ops/src/ops/moe_layer.cc index 70f4fac52..c924a1735 100644 --- a/custom_ops/xpu_ops/src/ops/moe_layer.cc +++ b/custom_ops/xpu_ops/src/ops/moe_layer.cc @@ -67,7 +67,7 @@ std::vector MoeLayerKernel( const auto xtype = x.dtype(); auto x_dims = x.shape(); auto up_gate_proj_dims = up_gate_proj_weight.shape(); - PD_CHECK(x_dims.size() == 2, "x_dims.size() shoud be 2."); + PD_CHECK(x_dims.size() == 2, "x_dims.size() should be 2."); PD_CHECK(up_gate_proj_dims.size() == 3, "up_gate_proj_dims.size() should be 3."); PD_CHECK(down_proj_in_scale.get_ptr() == nullptr, "down_proj_in_scale not support."); if (quant_method == "weight_only_int4") { diff --git a/custom_ops/xpu_ops/src/ops/mtp_ops/speculate_step_reschedule.cc b/custom_ops/xpu_ops/src/ops/mtp_ops/speculate_step_reschedule.cc index bc3675d4c..fb150bebc 100644 --- a/custom_ops/xpu_ops/src/ops/mtp_ops/speculate_step_reschedule.cc +++ b/custom_ops/xpu_ops/src/ops/mtp_ops/speculate_step_reschedule.cc @@ -122,7 +122,7 @@ void SpeculateStepSchedule( std::string inference_msg_id_env_str(inference_msg_id_env_p); inference_msg_id_from_env = std::stoi(inference_msg_id_env_str); if (inference_msg_id_from_env == 2) { - // 2 and -2 is perserve for no-output indication. + // 2 and -2 are reserved for no-output indication. throw std::runtime_error( " INFERENCE_MSG_ID cannot be 2, please use other number."); } diff --git a/custom_ops/xpu_ops/src/ops/save_with_output_msg.cc b/custom_ops/xpu_ops/src/ops/save_with_output_msg.cc index 28b9f1935..fd132a775 100644 --- a/custom_ops/xpu_ops/src/ops/save_with_output_msg.cc +++ b/custom_ops/xpu_ops/src/ops/save_with_output_msg.cc @@ -59,7 +59,7 @@ void SaveOutMmsg(const paddle::Tensor &x, const paddle::Tensor &not_need_stop, std::string inference_msg_id_env_str(inference_msg_id_env_p); inference_msg_id_from_env = std::stoi(inference_msg_id_env_str); if (inference_msg_id_from_env == 2) { - // 2 and -2 is perserve for no-output indication. + // 2 and -2 are reserved for no-output indication.
throw std::runtime_error( " INFERENCE_MSG_ID cannot be 2, please use other number."); } diff --git a/custom_ops/xpu_ops/src/plugin/src/kernel/kunlun3cpp/eb_adjust_batch.xpu b/custom_ops/xpu_ops/src/plugin/src/kernel/kunlun3cpp/eb_adjust_batch.xpu index be55e49b8..b675785a4 100644 --- a/custom_ops/xpu_ops/src/plugin/src/kernel/kunlun3cpp/eb_adjust_batch.xpu +++ b/custom_ops/xpu_ops/src/plugin/src/kernel/kunlun3cpp/eb_adjust_batch.xpu @@ -4,7 +4,7 @@ namespace xpu3 { namespace plugin { #define MAX_LM_SIZE 28672 -// One core has 32KB LM(gropu LM), MAX_LM_SIZE = (32 - 4)KB / 2 = 30720, 4KB is +// One core has 32KB LM(group LM), MAX_LM_SIZE = (32 - 4)KB / 2 = 30720, 4KB is // the stack space #define MAX_BATCH 512 #define ALIGNMENT 64 diff --git a/custom_ops/xpu_ops/src/plugin/src/kernel/kunlun3cpp/eb_gather_next_token.xpu b/custom_ops/xpu_ops/src/plugin/src/kernel/kunlun3cpp/eb_gather_next_token.xpu index b8ace9128..d2ef40cd5 100644 --- a/custom_ops/xpu_ops/src/plugin/src/kernel/kunlun3cpp/eb_gather_next_token.xpu +++ b/custom_ops/xpu_ops/src/plugin/src/kernel/kunlun3cpp/eb_gather_next_token.xpu @@ -4,7 +4,7 @@ namespace xpu3 { namespace plugin { #define MAX_LM_SIZE 28672 -// One core has 32KB LM(gropu LM), MAX_LM_SIZE = (32 - 4)KB / 2 = 30720, 4KB is +// One core has 32KB LM(group LM), MAX_LM_SIZE = (32 - 4)KB / 2 = 30720, 4KB is // the stack space #define MAX_BATCH 512 #define ALIGNMENT 64 diff --git a/custom_ops/xpu_ops/src/plugin/src/kernel/kunlun3cpp/quant2d_per_channel.xpu b/custom_ops/xpu_ops/src/plugin/src/kernel/kunlun3cpp/quant2d_per_channel.xpu index 958075c1c..43daf966c 100644 --- a/custom_ops/xpu_ops/src/plugin/src/kernel/kunlun3cpp/quant2d_per_channel.xpu +++ b/custom_ops/xpu_ops/src/plugin/src/kernel/kunlun3cpp/quant2d_per_channel.xpu @@ -8,7 +8,7 @@ namespace xpu3 { namespace plugin { #define MAX_SM_SIZE 32768 -// One core has 32KB LM(gropu LM), MAX_LM_SIZE = (32 - 4)KB / 2 = 30720, 4KB is +// One core has 32KB LM(group LM), MAX_LM_SIZE = (32 - 4)KB / 2 = 30720, 4KB is // the stack space #define MAX_BATCH 512 #define BANK_CONFLICT_M 128 diff --git a/custom_ops/xpu_ops/test/test_weight_quantize_xpu.py b/custom_ops/xpu_ops/test/test_weight_quantize_xpu.py index 59312c95d..04e6bc665 100644 --- a/custom_ops/xpu_ops/test/test_weight_quantize_xpu.py +++ b/custom_ops/xpu_ops/test/test_weight_quantize_xpu.py @@ -79,7 +79,7 @@ qw_pd_trans = paddle.transpose(qw_pd, [1, 0]) # print("wscale_pd:\n{}".format(wscale_pd)) # print("wscale_np:\n{}".format(wscale_np)) -# comparation +# comparison print(f"wscale_pd, mean={wscale_pd.mean()}, std={wscale_pd.std()}") print(f"wscale_np, mean={wscale_np.mean()}, std={wscale_np.std()}") print(f"qw_np, mean={qw_np.astype(np.float32).mean()}, std={qw_np.astype(np.float32).std()}") diff --git a/docs/features/graph_optimization.md b/docs/features/graph_optimization.md index ff335b66b..09d93f105 100644 --- a/docs/features/graph_optimization.md +++ b/docs/features/graph_optimization.md @@ -44,7 +44,7 @@ CudaGrpah can be enabled by setting `--use-cudagraph` or `--graph-optimization-c The `graph_opt_level` parameter within `--graph-optimization-config` is used to configure the graph optimization level, with the following available options: + `0`: Use Dynamic compute graph, default to 0 + `1`: Use Static compute graph, during the initialization phase, Paddle API will be used to convert the dynamic image into a static image -+ `2`: Base on Static compute graph, use the complier(CINN, Compiler Infrastructure for Neural Networks) of Paddle to compile and optimize ++ 
`2`: Based on the Static compute graph, use the compiler (CINN, Compiler Infrastructure for Neural Networks) of Paddle to compile and optimize In general, static graphs have lower Kernel Launch overhead than dynamic graphs, and it is recommended to use static graphs. For adapted models, FastDeploy's CudaGraph *can support both dynamic and static graphs* simultaneously. diff --git a/docs/get_started/installation/Enflame_gcu.md b/docs/get_started/installation/Enflame_gcu.md index e443a7ce3..1801cf6d0 100644 --- a/docs/get_started/installation/Enflame_gcu.md +++ b/docs/get_started/installation/Enflame_gcu.md @@ -62,7 +62,7 @@ python -m pip install paddlepaddle==3.1.1 -i https://www.paddlepaddle.org.cn/pac python -m pip install paddle-custom-gcu==3.1.1 -i https://www.paddlepaddle.org.cn/packages/stable/gcu/ # For source compilation, refer to: https://github.com/PaddlePaddle/PaddleCustomDevice/blob/develop/backends/gcu/README_cn.md ``` -For latest paddle verion on iluvatar. Refer to [PaddlePaddle Installation](https://www.paddlepaddle.org.cn/) +For the latest Paddle version on iluvatar, refer to [PaddlePaddle Installation](https://www.paddlepaddle.org.cn/) 6. Install FastDeploy and dependencies ```bash diff --git a/docs/usage/kunlunxin_xpu_deployment.md b/docs/usage/kunlunxin_xpu_deployment.md index 1096db339..04385d3d1 100644 --- a/docs/usage/kunlunxin_xpu_deployment.md +++ b/docs/usage/kunlunxin_xpu_deployment.md @@ -89,4 +89,4 @@ for chunk in response: print('\n') ``` -For detailed OpenAI protocol specifications, see [OpenAI Chat Compeltion API](https://platform.openai.com/docs/api-reference/chat/create). Differences from the standard OpenAI protocol are documented in [OpenAI Protocol-Compatible API Server](../online_serving/README.md). +For detailed OpenAI protocol specifications, see [OpenAI Chat Completion API](https://platform.openai.com/docs/api-reference/chat/create). Differences from the standard OpenAI protocol are documented in [OpenAI Protocol-Compatible API Server](../online_serving/README.md). diff --git a/docs/zh/features/sampling.md b/docs/zh/features/sampling.md index 51464515d..df04d1c28 100644 --- a/docs/zh/features/sampling.md +++ b/docs/zh/features/sampling.md @@ -1,6 +1,6 @@ # 采样策略 -采样策略用于决定如何从模型的输出概率分布中选择下一个token。FastDeploy目前支持 Top-p 、 Top-k_Top-p 和 Min-p Samping 多种采样策略。 +采样策略用于决定如何从模型的输出概率分布中选择下一个token。FastDeploy目前支持 Top-p 、 Top-k_Top-p 和 Min-p Sampling 多种采样策略。 1.
Top-p 采样 diff --git a/docs/zh/usage/kunlunxin_xpu_deployment.md b/docs/zh/usage/kunlunxin_xpu_deployment.md index b89481401..94e598afc 100644 --- a/docs/zh/usage/kunlunxin_xpu_deployment.md +++ b/docs/zh/usage/kunlunxin_xpu_deployment.md @@ -89,4 +89,4 @@ for chunk in response: print('\n') ``` -OpenAI 协议的更多说明可参考文档 [OpenAI Chat Compeltion API](https://platform.openai.com/docs/api-reference/chat/create),以及与 OpenAI 协议的区别可以参考 [兼容 OpenAI 协议的服务化部署](../online_serving/README.md)。 +OpenAI 协议的更多说明可参考文档 [OpenAI Chat Completion API](https://platform.openai.com/docs/api-reference/chat/create),以及与 OpenAI 协议的区别可以参考 [兼容 OpenAI 协议的服务化部署](../online_serving/README.md)。 diff --git a/fastdeploy/cache_manager/cache_transfer_manager.py b/fastdeploy/cache_manager/cache_transfer_manager.py index 5078a513d..cb793df44 100644 --- a/fastdeploy/cache_manager/cache_transfer_manager.py +++ b/fastdeploy/cache_manager/cache_transfer_manager.py @@ -57,7 +57,7 @@ def parse_args(): "--protocol", type=str, default="ipc", - help="cache transfer protocol, only surport ipc now", + help="cache transfer protocol, only support ipc now", ) parser.add_argument("--enable_splitwise", type=int, default=0, help="enable splitwise ") parser.add_argument("--cache_queue_port", type=int, default=9923, help="cache queue port") diff --git a/fastdeploy/config.py b/fastdeploy/config.py index e4182e6c9..2c0efd277 100644 --- a/fastdeploy/config.py +++ b/fastdeploy/config.py @@ -257,7 +257,7 @@ class ParallelConfig: self.sequence_parallel = False # Whether to enable sequence parallelism. self.use_ep = False # Whether to enable Expert Parallelism self.moe_phase = MoEPhase("prefill") # Generation phase - self.msg_queue_id = 1 # mesage queue id + self.msg_queue_id = 1 # message queue id self.tensor_parallel_rank = 0 # TP rank ID self.tensor_parallel_size = 1 # TP degree @@ -549,7 +549,7 @@ class GraphOptimizationConfig: It requires that all input buffers have fixed addresses, and all splitting ops write their outputs to input buffers. - With dyncmic graph backend: ... - - With static grpah backend: WIP + - With static graph backend: WIP """ self.sot_warmup_sizes: list[int] = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 16, 32, 64, 128] """ Number of warmup runs for SOT warmup. """ diff --git a/fastdeploy/engine/args_utils.py b/fastdeploy/engine/args_utils.py index 10ed83525..bfaabd981 100644 --- a/fastdeploy/engine/args_utils.py +++ b/fastdeploy/engine/args_utils.py @@ -531,7 +531,7 @@ class EngineArgs: "--quantization", type=str, default=EngineArgs.quantization, - help="Quantization name for the model, currentlly support " + help="Quantization name for the model, currently support " "'wint8', 'wint4'," "default is None. The priority of this configuration " "is lower than that of the config file. " @@ -829,7 +829,7 @@ class EngineArgs: scheduler_group.add_argument( "--scheduler-topic", default=EngineArgs.scheduler_topic, - help=f"Topic of scheduler. Defaule is {EngineArgs.scheduler_topic}. (global)", + help=f"Topic of scheduler. Default is {EngineArgs.scheduler_topic}. 
(global)", ) scheduler_group.add_argument( "--scheduler-min-load-score", diff --git a/fastdeploy/engine/common_engine.py b/fastdeploy/engine/common_engine.py index ea00ea857..62442d8cb 100644 --- a/fastdeploy/engine/common_engine.py +++ b/fastdeploy/engine/common_engine.py @@ -644,13 +644,13 @@ class EngineSevice: self.zmq_server.send_multipart(request_id, [error_result]) except Exception as e: llm_logger.error( - f"Error happend while receving new request from zmq, details={e}, " + f"Error happend while receiving new request from zmq, details={e}, " f"traceback={traceback.format_exc()}" ) def _zmq_send_generated_tokens(self): """ - Recieve output for zmq + Receive output for zmq """ while self.running: try: diff --git a/fastdeploy/engine/sched/resource_manager_v1.py b/fastdeploy/engine/sched/resource_manager_v1.py index 734d99a5f..5b69e610a 100644 --- a/fastdeploy/engine/sched/resource_manager_v1.py +++ b/fastdeploy/engine/sched/resource_manager_v1.py @@ -458,7 +458,7 @@ class ResourceManagerV1(ResourceManager): def _free_blocks(self, request: Request): if self.config.cache_config.enable_prefix_caching: - # TODO(chengyanfu): support cache ouput blocks for prefix caching + # TODO(chengyanfu): support cache output blocks for prefix caching if request.get("prefill_block_num", None) is None: leaf_node = self.cache_manager.req_leaf_map[request.request_id] self.cache_manager.decrease_request_share_count(request.request_id) diff --git a/fastdeploy/entrypoints/llm.py b/fastdeploy/entrypoints/llm.py index 8d88ea3d7..f9537e557 100644 --- a/fastdeploy/entrypoints/llm.py +++ b/fastdeploy/entrypoints/llm.py @@ -112,7 +112,7 @@ class LLM: def _receive_output(self): """ - Recieve output from token processor and store them in cache + Receive output from token processor and store them in cache """ while True: try: diff --git a/fastdeploy/model_executor/graph_optimization/cudagraph_piecewise_backend.py b/fastdeploy/model_executor/graph_optimization/cudagraph_piecewise_backend.py index 30a28d293..659eba7b6 100644 --- a/fastdeploy/model_executor/graph_optimization/cudagraph_piecewise_backend.py +++ b/fastdeploy/model_executor/graph_optimization/cudagraph_piecewise_backend.py @@ -40,7 +40,7 @@ class ConcreteSizeEntry: # Has runtime-bs been captured before captured: bool = False - # Need to be captured callable object(dynamic graph or static grpah backend) + # Need to be captured callable object(dynamic graph or static graph backend) runnable: Callable = None # type: ignore # Number of completed warmups num_finished_warmup: int = 0 diff --git a/fastdeploy/model_executor/graph_optimization/graph_optimization_backend.py b/fastdeploy/model_executor/graph_optimization/graph_optimization_backend.py index e843753e8..5ebc82fb1 100644 --- a/fastdeploy/model_executor/graph_optimization/graph_optimization_backend.py +++ b/fastdeploy/model_executor/graph_optimization/graph_optimization_backend.py @@ -117,9 +117,9 @@ class GraphOptBackend: self.max_captre_batch = fd_config.graph_opt_config.cudagraph_capture_sizes[0] if self.fd_config.graph_opt_config.graph_opt_level > 0: - # 1. Prepare cuda grpah input buffers (contain output of subgraphs) + # 1. Prepare cuda graph input buffers (contain output of subgraphs) - # 2. Convert dynamic grpah to static graph + # 2. 
Convert dynamic graph to static graph backend = ( ToStaticBackend.CINN if self.fd_config.graph_opt_config.graph_opt_level > 1 else ToStaticBackend.PHI diff --git a/fastdeploy/model_executor/layers/attention/append_attn_backend.py b/fastdeploy/model_executor/layers/attention/append_attn_backend.py index 29d570e23..59fe071af 100644 --- a/fastdeploy/model_executor/layers/attention/append_attn_backend.py +++ b/fastdeploy/model_executor/layers/attention/append_attn_backend.py @@ -193,7 +193,7 @@ class AppendAttentionBackend(AttentionBackend): kv_cache_quant_type: str = None, ): """ - Caculate kv cache shape + Calculate kv cache shape """ if kv_cache_quant_type is not None and kv_cache_quant_type == "int4_zp": return ( diff --git a/fastdeploy/model_executor/layers/attention/block_multihead_attn_backend.py b/fastdeploy/model_executor/layers/attention/block_multihead_attn_backend.py index f5800d156..418876271 100644 --- a/fastdeploy/model_executor/layers/attention/block_multihead_attn_backend.py +++ b/fastdeploy/model_executor/layers/attention/block_multihead_attn_backend.py @@ -114,7 +114,7 @@ class BlockAttentionBackend(AttentionBackend): kv_cache_quant_type: str = None, ): """ - Caculate kv cache shape + Calculate kv cache shape """ if kv_cache_quant_type is not None and kv_cache_quant_type == "int4_zp": return ( diff --git a/fastdeploy/model_executor/layers/attention/flash_attn_backend.py b/fastdeploy/model_executor/layers/attention/flash_attn_backend.py index b7d8c828b..193a31ff5 100644 --- a/fastdeploy/model_executor/layers/attention/flash_attn_backend.py +++ b/fastdeploy/model_executor/layers/attention/flash_attn_backend.py @@ -176,7 +176,7 @@ class FlashAttentionBackend(AttentionBackend): kv_cache_quant_type: str = None, ): """ - Caculate kv cache shape + Calculate kv cache shape """ if kv_cache_quant_type is not None and kv_cache_quant_type == "int4_zp": return ( diff --git a/fastdeploy/model_executor/layers/attention/iluvatar_attn_backend.py b/fastdeploy/model_executor/layers/attention/iluvatar_attn_backend.py index b6064a5de..abe485094 100644 --- a/fastdeploy/model_executor/layers/attention/iluvatar_attn_backend.py +++ b/fastdeploy/model_executor/layers/attention/iluvatar_attn_backend.py @@ -210,7 +210,7 @@ class IluvatarAttnBackend(AttentionBackend): kv_cache_quant_type: str = None, ): """ - Caculate kv cache shape + Calculate kv cache shape """ return ( max_num_blocks, diff --git a/fastdeploy/model_executor/layers/attention/xpu_attn_backend.py b/fastdeploy/model_executor/layers/attention/xpu_attn_backend.py index 45ae75184..938693738 100644 --- a/fastdeploy/model_executor/layers/attention/xpu_attn_backend.py +++ b/fastdeploy/model_executor/layers/attention/xpu_attn_backend.py @@ -130,7 +130,7 @@ class XPUAttentionBackend(AttentionBackend): kv_cache_quant_type: str = None, ) -> Tuple[int, int, int, int]: """ - Caculate kv cache shape + Calculate kv cache shape """ return ( max_num_blocks, diff --git a/fastdeploy/model_executor/layers/backends/gcu/attention/flash_attn_backend.py b/fastdeploy/model_executor/layers/backends/gcu/attention/flash_attn_backend.py index 4f94e561a..a77c2f255 100644 --- a/fastdeploy/model_executor/layers/backends/gcu/attention/flash_attn_backend.py +++ b/fastdeploy/model_executor/layers/backends/gcu/attention/flash_attn_backend.py @@ -170,7 +170,7 @@ class GCUFlashAttnBackend(AttentionBackend): cache_len = 0 elif self.seq_lens_decoder_list[seq_idx][0] != 0: # decode cache_len = self.seq_lens_decoder_list[seq_idx][0] - # else: doesnot have req in this seq_idx + # 
else: doesn't have req in this seq_idx if cache_len is not None: lens_this_time = self.seq_lens_this_time_list[seq_idx] @@ -212,7 +212,7 @@ class GCUFlashAttnBackend(AttentionBackend): kv_cache_quant_type: str = None, ): """ - Caculate kv cache shape + Calculate kv cache shape """ # [total_tokens, kv_num_heads, head_dim] return ( diff --git a/fastdeploy/model_executor/layers/backends/gcu/attention/mem_efficient_attn_backend.py b/fastdeploy/model_executor/layers/backends/gcu/attention/mem_efficient_attn_backend.py index 8ecd1b4be..6af54ee9a 100644 --- a/fastdeploy/model_executor/layers/backends/gcu/attention/mem_efficient_attn_backend.py +++ b/fastdeploy/model_executor/layers/backends/gcu/attention/mem_efficient_attn_backend.py @@ -171,7 +171,7 @@ class GCUMemEfficientAttnBackend(AttentionBackend): cache_len = 0 elif self.seq_lens_decoder_list[seq_idx][0] != 0: # decode cache_len = self.seq_lens_decoder_list[seq_idx][0] - # else: doesnot have req in this seq_idx + # else: doesn't have req in this seq_idx if cache_len is not None: lens_this_time = self.seq_lens_this_time_list[seq_idx] @@ -224,7 +224,7 @@ class GCUMemEfficientAttnBackend(AttentionBackend): kv_cache_quant_type: str = None, ): """ - Caculate kv cache shape + Calculate kv cache shape """ # [total_tokens, kv_num_heads, head_dim] return ( diff --git a/fastdeploy/model_executor/layers/backends/metax/attention/flash_attn_backend.py b/fastdeploy/model_executor/layers/backends/metax/attention/flash_attn_backend.py index fc7336e75..790e989f2 100644 --- a/fastdeploy/model_executor/layers/backends/metax/attention/flash_attn_backend.py +++ b/fastdeploy/model_executor/layers/backends/metax/attention/flash_attn_backend.py @@ -137,7 +137,7 @@ class FlashAttentionBackend(AttentionBackend): kv_cache_quant_type: str = None, ): """ - Caculate kv cache shape + Calculate kv cache shape """ if kv_cache_quant_type is not None and kv_cache_quant_type == "int4_zp": return ( diff --git a/fastdeploy/model_executor/layers/moe/ep.py b/fastdeploy/model_executor/layers/moe/ep.py index 9659aec7d..e3791b700 100644 --- a/fastdeploy/model_executor/layers/moe/ep.py +++ b/fastdeploy/model_executor/layers/moe/ep.py @@ -114,7 +114,7 @@ class DeepEPEngine: low_latency_mode=True, num_qps_per_rank=24, ) - # In disaggregated mode on mutiple nodes, we either use + # In disaggregated mode on multiple nodes, we either use # high throughput mode or low latency mode. 
else: if moe_phase.phase == "decode": diff --git a/fastdeploy/model_executor/layers/sample/early_stopper.py b/fastdeploy/model_executor/layers/sample/early_stopper.py index 5f0a24888..683455771 100644 --- a/fastdeploy/model_executor/layers/sample/early_stopper.py +++ b/fastdeploy/model_executor/layers/sample/early_stopper.py @@ -35,7 +35,7 @@ class EarlyStopper: @abstractmethod def process(self, probs: paddle.Tensor, next_tokens: paddle.Tensor, stop_flags: paddle.Tensor): """ - processs the stopper and set the stop_flags corresponding to the batch that triggers early stop to True + process the stopper and set the stop_flags corresponding to the batch that triggers early stop to True args: - probs: [batch_size, vocab_size], the probs of every sample - next_tokens: [batch_size, 1], the token index of every chosen sample diff --git a/fastdeploy/output/token_processor.py b/fastdeploy/output/token_processor.py index 1f9ba002c..cc78c3342 100644 --- a/fastdeploy/output/token_processor.py +++ b/fastdeploy/output/token_processor.py @@ -267,7 +267,7 @@ class TokenProcessor: spec_logger.info( f"Speculate global accept ratio(Accept draft_tokens/Generated tokens): {accept_ratio}" f" total step: {self.total_step}. total output token num: {self.number_of_output_tokens}" - f" avarage accept len: {self.number_of_output_tokens / self.total_step}" + f" average accept len: {self.number_of_output_tokens / self.total_step}" ) if self.cfg.speculative_config.method in ["mtp"]: diff --git a/fastdeploy/spec_decode/base.py b/fastdeploy/spec_decode/base.py index 1719b7f26..114bcc00c 100644 --- a/fastdeploy/spec_decode/base.py +++ b/fastdeploy/spec_decode/base.py @@ -72,7 +72,7 @@ class Proposer(ABC): @abstractmethod def _run_impl(self, *args, **kwargs) -> Any: """ - Implemention for different method + Implementation for different method """ raise NotImplementedError diff --git a/fastdeploy/worker/experts_manager.py b/fastdeploy/worker/experts_manager.py index 0e7fd726c..4f6e4fe92 100644 --- a/fastdeploy/worker/experts_manager.py +++ b/fastdeploy/worker/experts_manager.py @@ -14,7 +14,7 @@ # limitations under the License. """ -"""redundant expert manger.""" +"""redundant expert manager.""" from typing import Optional, Tuple import numpy as np diff --git a/fastdeploy/worker/gcu_worker.py b/fastdeploy/worker/gcu_worker.py index 54b4fa7e9..52d43f454 100644 --- a/fastdeploy/worker/gcu_worker.py +++ b/fastdeploy/worker/gcu_worker.py @@ -49,7 +49,7 @@ class GcuWorker(WorkerBase): def init_device(self): """Initialize device and Construct model runner""" if paddle.is_compiled_with_custom_device("gcu"): - # Set evironment variable + # Set environment variable self.device_ids = self.parallel_config.device_ids.split(",") self.device = f"gcu:{self.local_rank}" paddle.device.set_device(self.device) @@ -127,7 +127,7 @@ class GcuWorker(WorkerBase): # NOTE(gongshaotian): may be not need warm_up at this place if self.model_runner.graph_opt_level >= 1: self.model_runner.sot_warmup() - # 2. Triger cuda grpah capture + # 2. 
Trigger cuda graph capture self.model_runner.capture_model() set_random_seed(self.fd_config.model_config.seed) diff --git a/fastdeploy/worker/gpu_worker.py b/fastdeploy/worker/gpu_worker.py index e7b1adb4b..1bd0107d5 100644 --- a/fastdeploy/worker/gpu_worker.py +++ b/fastdeploy/worker/gpu_worker.py @@ -60,7 +60,7 @@ class GpuWorker(WorkerBase): """ self.max_chips_per_node = 16 if current_platform.is_iluvatar() else 8 if self.device_config.device_type == "cuda" and paddle.device.is_compiled_with_cuda(): - # Set evironment variable + # Set environment variable self.device_ids = self.parallel_config.device_ids.split(",") self.device = f"gpu:{self.local_rank % self.max_chips_per_node}" paddle.device.set_device(self.device) @@ -169,7 +169,7 @@ class GpuWorker(WorkerBase): ) ) - return available_kv_cache_memory # return to caculate the block num in this device + return available_kv_cache_memory # return to calculate the block num in this device def load_model(self) -> None: """Load model""" @@ -209,7 +209,7 @@ class GpuWorker(WorkerBase): """ if self.model_runner.graph_opt_level >= 1: self.model_runner.sot_warmup() - # Triger cuda grpah capture + # Trigger cuda graph capture self.model_runner.capture_model() def check_health(self) -> bool: diff --git a/fastdeploy/worker/iluvatar_worker.py b/fastdeploy/worker/iluvatar_worker.py index f8e740cc4..c1b060588 100644 --- a/fastdeploy/worker/iluvatar_worker.py +++ b/fastdeploy/worker/iluvatar_worker.py @@ -51,7 +51,7 @@ class IluvatarWorker(GpuWorker): Initialize device and construct model runner """ if paddle.is_compiled_with_custom_device("iluvatar_gpu"): - # Set evironment variable + # Set environment variable self.device = f"iluvatar_gpu:{self.local_rank}" paddle.device.set_device(self.device) paddle.set_default_dtype(self.parallel_config.dtype) diff --git a/fastdeploy/worker/metax_worker.py b/fastdeploy/worker/metax_worker.py index 92ed6f7fd..fdf7a349b 100644 --- a/fastdeploy/worker/metax_worker.py +++ b/fastdeploy/worker/metax_worker.py @@ -54,7 +54,7 @@ class MetaxWorker(WorkerBase): """ self.max_chips_per_node = 8 if paddle.is_compiled_with_custom_device("metax_gpu"): - # Set evironment variable + # Set environment variable self.device_ids = self.parallel_config.device_ids.split(",") self.device = f"metax_gpu:{self.local_rank % self.max_chips_per_node}" paddle.device.set_device(self.device) @@ -202,7 +202,7 @@ class MetaxWorker(WorkerBase): """ if self.model_runner.graph_opt_level >= 1: self.model_runner.sot_warmup() - # Todo Triger cuda grpah capture. + # Todo Trigger cuda graph capture. def check_health(self) -> bool: """ """ diff --git a/fastdeploy/worker/utils.py b/fastdeploy/worker/utils.py index 7554c7c08..7a2562f24 100644 --- a/fastdeploy/worker/utils.py +++ b/fastdeploy/worker/utils.py @@ -21,7 +21,7 @@ import traceback def check_safetensors_model(model_dir: str): """ model_dir : the directory of the model - Check whther the model is safetensors format + Check whether the model is safetensors format """ model_files = list() all_files = os.listdir(model_dir) diff --git a/fastdeploy/worker/worker_base.py b/fastdeploy/worker/worker_base.py index 30bd39e26..136327e2a 100644 --- a/fastdeploy/worker/worker_base.py +++ b/fastdeploy/worker/worker_base.py @@ -27,7 +27,7 @@ from fastdeploy.worker.output import ModelRunnerOutput class WorkerBase(ABC): """ Engine -> (WIP)Executor -> Worker -> ModelRunner -> Model - Worker interface that allows inference framwork to cleanly separate implementations for different harware. 
+ Worker interface that allows inference framework to cleanly separate implementations for different hardware. """ def __init__( @@ -89,7 +89,7 @@ class WorkerBase(ABC): @abstractmethod def graph_optimize_and_warm_up_model(self) -> None: - """Prepare model for execution through grpah optimizaiton(CudaGrpah/CINN) or warmup.""" + """Prepare model for execution through graph optimization (CudaGraph/CINN) or warmup.""" raise NotImplementedError @abstractmethod diff --git a/fastdeploy/worker/worker_process.py b/fastdeploy/worker/worker_process.py index d2ab0db99..a57195391 100644 --- a/fastdeploy/worker/worker_process.py +++ b/fastdeploy/worker/worker_process.py @@ -249,7 +249,7 @@ class PaddleDisWorkerProc: ) def event_loop_normal(self) -> None: - """Main event loop for Paddle Distrubuted Workers. + """Main event loop for Paddle Distributed Workers. TODO(gongshaotian): support remote calling of functions that control worker. """ # Currently, only support single node @@ -493,7 +493,7 @@ def parse_args(): "--speculative_config", type=json.loads, default=None, - help="Configation of SpeculativeConfig.", + help="Configuration of SpeculativeConfig.", ) parser.add_argument( "--max_num_batched_tokens", @@ -542,7 +542,7 @@ def parse_args(): "--quantization", type=str, default="None", - help="Quantization name for the model, currentlly support " + help="Quantization name for the model, currently support " "'wint4', 'wint8'," "default is None. The priority of this configuration " "is lower than that of the config file. " @@ -552,7 +552,7 @@ def parse_args(): "--graph_optimization_config", type=json.loads, default=None, - help="Configation of Graph optimization backend.", + help="Configuration of Graph optimization backend.", ) parser.add_argument( "--moba_attention_config", diff --git a/fastdeploy/worker/xpu_worker.py b/fastdeploy/worker/xpu_worker.py index 81bb581a4..9de95aa87 100644 --- a/fastdeploy/worker/xpu_worker.py +++ b/fastdeploy/worker/xpu_worker.py @@ -50,7 +50,7 @@ class XpuWorker(WorkerBase): def init_device(self): """Initialize device and Construct model runner""" if paddle.is_compiled_with_xpu(): - # Set evironment variable + # Set environment variable self.device = f"xpu:{self.local_rank}" paddle.device.set_device(self.device) paddle.set_default_dtype(self.parallel_config.dtype) diff --git a/tests/graph_optimization/test_cuda_graph_dynamic_subgraph.py b/tests/graph_optimization/test_cuda_graph_dynamic_subgraph.py index 82b8a27ac..1b5039ade 100644 --- a/tests/graph_optimization/test_cuda_graph_dynamic_subgraph.py +++ b/tests/graph_optimization/test_cuda_graph_dynamic_subgraph.py @@ -107,7 +107,7 @@ class TestModel1(paddle.nn.Layer): sub_meta1 = forward_meta sublayer1_output = self.sublayer1(ids_remove_padding=ids_remove_padding, forward_meta=sub_meta1) - # sublayer2 not use cuda garph + # sublayer2 does not use cuda graph sub_meta2 = ForwardMeta(input_ids=sublayer1_output, ids_remove_padding=sublayer1_output) sublayer2_output = self.sublayer2(ids_remove_padding=sublayer1_output, forward_meta=sub_meta2) self.sublayer2_output_buffer.copy_(sublayer2_output, False) @@ -131,7 +131,7 @@ class TestModel1(paddle.nn.Layer): ids_remove_padding=ids_remove_padding, forward_meta=sub_meta1 ) - # sublayer2 not use cuda garph + # sublayer2 does not use cuda graph sub_meta2 = ForwardMeta(input_ids=sublayer1_output, ids_remove_padding=sublayer1_output) sublayer2_output = self.sublayer2.forward_correct(ids_remove_padding=sublayer1_output, forward_meta=sub_meta2)